From: Marek Olšák Date: Wed, 26 Aug 2020 05:02:46 +0000 (-0400) Subject: radeonsi: move si_upload_vertex_buffer_descriptors into si_state_draw.c X-Git-Tag: upstream/21.0.0~5116 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=8ab15c9e33a71923bb1b2c338544393b0e7bcffb;p=platform%2Fupstream%2Fmesa.git radeonsi: move si_upload_vertex_buffer_descriptors into si_state_draw.c It will be inlined there. Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index bbcaac1..e41ea6b 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -1073,109 +1073,6 @@ static void si_vertex_buffers_begin_new_cs(struct si_context *sctx) RADEON_PRIO_DESCRIPTORS); } -bool si_upload_vertex_buffer_descriptors(struct si_context *sctx) -{ - unsigned i, count = sctx->num_vertex_elements; - uint32_t *ptr; - - if (!sctx->vertex_buffers_dirty || !count) - return true; - - struct si_vertex_elements *velems = sctx->vertex_elements; - unsigned alloc_size = velems->vb_desc_list_alloc_size; - - if (alloc_size) { - /* Vertex buffer descriptors are the only ones which are uploaded - * directly through a staging buffer and don't go through - * the fine-grained upload path. - */ - u_upload_alloc(sctx->b.const_uploader, 0, alloc_size, - si_optimal_tcc_alignment(sctx, alloc_size), &sctx->vb_descriptors_offset, - (struct pipe_resource **)&sctx->vb_descriptors_buffer, (void **)&ptr); - if (!sctx->vb_descriptors_buffer) { - sctx->vb_descriptors_offset = 0; - sctx->vb_descriptors_gpu_list = NULL; - return false; - } - - sctx->vb_descriptors_gpu_list = ptr; - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, sctx->vb_descriptors_buffer, RADEON_USAGE_READ, - RADEON_PRIO_DESCRIPTORS); - sctx->vertex_buffer_pointer_dirty = true; - sctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS; - } else { - si_resource_reference(&sctx->vb_descriptors_buffer, NULL); - sctx->vertex_buffer_pointer_dirty = false; - sctx->prefetch_L2_mask &= ~SI_PREFETCH_VBO_DESCRIPTORS; - } - - assert(count <= SI_MAX_ATTRIBS); - - unsigned first_vb_use_mask = velems->first_vb_use_mask; - unsigned num_vbos_in_user_sgprs = sctx->screen->num_vbos_in_user_sgprs; - - for (i = 0; i < count; i++) { - struct pipe_vertex_buffer *vb; - struct si_resource *buf; - unsigned vbo_index = velems->vertex_buffer_index[i]; - uint32_t *desc = i < num_vbos_in_user_sgprs ? &sctx->vb_descriptor_user_sgprs[i * 4] - : &ptr[(i - num_vbos_in_user_sgprs) * 4]; - - vb = &sctx->vertex_buffer[vbo_index]; - buf = si_resource(vb->buffer.resource); - if (!buf) { - memset(desc, 0, 16); - continue; - } - - int64_t offset = (int64_t)((int)vb->buffer_offset) + velems->src_offset[i]; - - if (offset >= buf->b.b.width0) { - assert(offset < buf->b.b.width0); - memset(desc, 0, 16); - continue; - } - - uint64_t va = buf->gpu_address + offset; - - int64_t num_records = (int64_t)buf->b.b.width0 - offset; - if (sctx->chip_class != GFX8 && vb->stride) { - /* Round up by rounding down and adding 1 */ - num_records = (num_records - velems->format_size[i]) / vb->stride + 1; - } - assert(num_records >= 0 && num_records <= UINT_MAX); - - uint32_t rsrc_word3 = velems->rsrc_word3[i]; - - /* OOB_SELECT chooses the out-of-bounds check: - * - 1: index >= NUM_RECORDS (Structured) - * - 3: offset >= NUM_RECORDS (Raw) - */ - if (sctx->chip_class >= GFX10) - rsrc_word3 |= S_008F0C_OOB_SELECT(vb->stride ? V_008F0C_OOB_SELECT_STRUCTURED - : V_008F0C_OOB_SELECT_RAW); - - desc[0] = va; - desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(vb->stride); - desc[2] = num_records; - desc[3] = rsrc_word3; - - if (first_vb_use_mask & (1 << i)) { - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, si_resource(vb->buffer.resource), - RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER); - } - } - - /* Don't flush the const cache. It would have a very negative effect - * on performance (confirmed by testing). New descriptors are always - * uploaded to a fresh new buffer, so I don't think flushing the const - * cache is needed. */ - si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); - sctx->vertex_buffer_user_sgprs_dirty = num_vbos_in_user_sgprs > 0; - sctx->vertex_buffers_dirty = false; - return true; -} - /* CONSTANT BUFFERS */ static struct si_descriptors *si_const_and_shader_buffer_descriptors(struct si_context *sctx, diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 4d42a40..9ac4d51 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -496,7 +496,6 @@ void si_set_ring_buffer(struct si_context *sctx, uint slot, struct pipe_resource unsigned stride, unsigned num_records, bool add_tid, bool swizzle, unsigned element_size, unsigned index_stride, uint64_t offset); void si_init_all_descriptors(struct si_context *sctx); -bool si_upload_vertex_buffer_descriptors(struct si_context *sctx); bool si_upload_graphics_shader_descriptors(struct si_context *sctx); bool si_upload_compute_shader_descriptors(struct si_context *sctx); void si_release_all_descriptors(struct si_context *sctx); diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index 9c1b8e0..9a7f2f7 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -1410,6 +1410,109 @@ void si_emit_cache_flush(struct si_context *sctx) sctx->flags = 0; } +static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx) +{ + unsigned i, count = sctx->num_vertex_elements; + uint32_t *ptr; + + if (!sctx->vertex_buffers_dirty || !count) + return true; + + struct si_vertex_elements *velems = sctx->vertex_elements; + unsigned alloc_size = velems->vb_desc_list_alloc_size; + + if (alloc_size) { + /* Vertex buffer descriptors are the only ones which are uploaded + * directly through a staging buffer and don't go through + * the fine-grained upload path. + */ + u_upload_alloc(sctx->b.const_uploader, 0, alloc_size, + si_optimal_tcc_alignment(sctx, alloc_size), &sctx->vb_descriptors_offset, + (struct pipe_resource **)&sctx->vb_descriptors_buffer, (void **)&ptr); + if (!sctx->vb_descriptors_buffer) { + sctx->vb_descriptors_offset = 0; + sctx->vb_descriptors_gpu_list = NULL; + return false; + } + + sctx->vb_descriptors_gpu_list = ptr; + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, sctx->vb_descriptors_buffer, RADEON_USAGE_READ, + RADEON_PRIO_DESCRIPTORS); + sctx->vertex_buffer_pointer_dirty = true; + sctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS; + } else { + si_resource_reference(&sctx->vb_descriptors_buffer, NULL); + sctx->vertex_buffer_pointer_dirty = false; + sctx->prefetch_L2_mask &= ~SI_PREFETCH_VBO_DESCRIPTORS; + } + + assert(count <= SI_MAX_ATTRIBS); + + unsigned first_vb_use_mask = velems->first_vb_use_mask; + unsigned num_vbos_in_user_sgprs = sctx->screen->num_vbos_in_user_sgprs; + + for (i = 0; i < count; i++) { + struct pipe_vertex_buffer *vb; + struct si_resource *buf; + unsigned vbo_index = velems->vertex_buffer_index[i]; + uint32_t *desc = i < num_vbos_in_user_sgprs ? &sctx->vb_descriptor_user_sgprs[i * 4] + : &ptr[(i - num_vbos_in_user_sgprs) * 4]; + + vb = &sctx->vertex_buffer[vbo_index]; + buf = si_resource(vb->buffer.resource); + if (!buf) { + memset(desc, 0, 16); + continue; + } + + int64_t offset = (int64_t)((int)vb->buffer_offset) + velems->src_offset[i]; + + if (offset >= buf->b.b.width0) { + assert(offset < buf->b.b.width0); + memset(desc, 0, 16); + continue; + } + + uint64_t va = buf->gpu_address + offset; + + int64_t num_records = (int64_t)buf->b.b.width0 - offset; + if (sctx->chip_class != GFX8 && vb->stride) { + /* Round up by rounding down and adding 1 */ + num_records = (num_records - velems->format_size[i]) / vb->stride + 1; + } + assert(num_records >= 0 && num_records <= UINT_MAX); + + uint32_t rsrc_word3 = velems->rsrc_word3[i]; + + /* OOB_SELECT chooses the out-of-bounds check: + * - 1: index >= NUM_RECORDS (Structured) + * - 3: offset >= NUM_RECORDS (Raw) + */ + if (sctx->chip_class >= GFX10) + rsrc_word3 |= S_008F0C_OOB_SELECT(vb->stride ? V_008F0C_OOB_SELECT_STRUCTURED + : V_008F0C_OOB_SELECT_RAW); + + desc[0] = va; + desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(vb->stride); + desc[2] = num_records; + desc[3] = rsrc_word3; + + if (first_vb_use_mask & (1 << i)) { + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, si_resource(vb->buffer.resource), + RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER); + } + } + + /* Don't flush the const cache. It would have a very negative effect + * on performance (confirmed by testing). New descriptors are always + * uploaded to a fresh new buffer, so I don't think flushing the const + * cache is needed. */ + si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); + sctx->vertex_buffer_user_sgprs_dirty = num_vbos_in_user_sgprs > 0; + sctx->vertex_buffers_dirty = false; + return true; +} + static void si_get_draw_start_count(struct si_context *sctx, const struct pipe_draw_info *info, unsigned *start, unsigned *count) {