From 3360dbe0c1ebeead9046955b3de4dcdf86cfbb40 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Mon, 13 Feb 2017 07:30:29 +0000 Subject: [PATCH] radv: fixup IA_MULTI_VGT_PARAM handling. This ports the remains of the workarounds from radeonsi for the non-TESS cases. It should provide equivalent workarounds for hawaii and bonarie. Reviewed-by: Bas Nieuwenhuizen Signed-off-by: Dave Airlie --- src/amd/vulkan/radv_cmd_buffer.c | 24 ++++++++----- src/amd/vulkan/radv_pipeline.c | 22 +++++++++++- src/amd/vulkan/radv_private.h | 12 +++++-- src/amd/vulkan/si_cmd_buffer.c | 78 +++++++++++++++++++++++++++++----------- 4 files changed, 105 insertions(+), 31 deletions(-) diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index 8f2e984..8a92ffb 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -1267,7 +1267,8 @@ radv_flush_constants(struct radv_cmd_buffer *cmd_buffer, } static void -radv_cmd_buffer_flush_state(struct radv_cmd_buffer *cmd_buffer) +radv_cmd_buffer_flush_state(struct radv_cmd_buffer *cmd_buffer, bool instanced_or_indirect_draw, + uint32_t draw_vertex_count) { struct radv_pipeline *pipeline = cmd_buffer->state.pipeline; struct radv_device *device = cmd_buffer->device; @@ -1332,6 +1333,15 @@ radv_cmd_buffer_flush_state(struct radv_cmd_buffer *cmd_buffer) if (cmd_buffer->state.dirty & (RADV_CMD_DIRTY_DYNAMIC_SCISSOR)) radv_emit_scissor(cmd_buffer); + ia_multi_vgt_param = si_get_ia_multi_vgt_param(cmd_buffer, instanced_or_indirect_draw, draw_vertex_count); + if (cmd_buffer->state.last_ia_multi_vgt_param != ia_multi_vgt_param) { + if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) + radeon_set_context_reg_idx(cmd_buffer->cs, R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param); + else + radeon_set_context_reg(cmd_buffer->cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param); + cmd_buffer->state.last_ia_multi_vgt_param = ia_multi_vgt_param; + } + if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) { uint32_t stages = 0; @@ -1341,15 +1351,12 @@ radv_cmd_buffer_flush_state(struct radv_cmd_buffer *cmd_buffer) S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER); radeon_set_context_reg(cmd_buffer->cs, R_028B54_VGT_SHADER_STAGES_EN, stages); - ia_multi_vgt_param = si_get_ia_multi_vgt_param(cmd_buffer); if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) { - radeon_set_context_reg_idx(cmd_buffer->cs, R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param); radeon_set_context_reg_idx(cmd_buffer->cs, R_028B58_VGT_LS_HS_CONFIG, 2, ls_hs_config); radeon_set_uconfig_reg_idx(cmd_buffer->cs, R_030908_VGT_PRIMITIVE_TYPE, 1, cmd_buffer->state.pipeline->graphics.prim); } else { radeon_set_config_reg(cmd_buffer->cs, R_008958_VGT_PRIMITIVE_TYPE, cmd_buffer->state.pipeline->graphics.prim); - radeon_set_context_reg(cmd_buffer->cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param); radeon_set_context_reg(cmd_buffer->cs, R_028B58_VGT_LS_HS_CONFIG, ls_hs_config); } radeon_set_context_reg(cmd_buffer->cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, cmd_buffer->state.pipeline->graphics.gs_out); @@ -2188,7 +2195,8 @@ void radv_CmdDraw( uint32_t firstInstance) { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - radv_cmd_buffer_flush_state(cmd_buffer); + + radv_cmd_buffer_flush_state(cmd_buffer, (instanceCount > 1), vertexCount); MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 10); @@ -2239,7 +2247,7 @@ void radv_CmdDrawIndexed( uint32_t index_max_size = (cmd_buffer->state.index_buffer->size - cmd_buffer->state.index_offset) / index_size; uint64_t index_va; - radv_cmd_buffer_flush_state(cmd_buffer); + radv_cmd_buffer_flush_state(cmd_buffer, (instanceCount > 1), indexCount); radv_emit_primitive_reset_index(cmd_buffer); MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 15); @@ -2337,7 +2345,7 @@ radv_cmd_draw_indirect_count(VkCommandBuffer command uint32_t stride) { RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - radv_cmd_buffer_flush_state(cmd_buffer); + radv_cmd_buffer_flush_state(cmd_buffer, true, 0); MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 14); @@ -2362,7 +2370,7 @@ radv_cmd_draw_indexed_indirect_count( int index_size = cmd_buffer->state.index_type ? 4 : 2; uint32_t index_max_size = (cmd_buffer->state.index_buffer->size - cmd_buffer->state.index_offset) / index_size; uint64_t index_va; - radv_cmd_buffer_flush_state(cmd_buffer); + radv_cmd_buffer_flush_state(cmd_buffer, true, 0); radv_emit_primitive_reset_index(cmd_buffer); index_va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->state.index_buffer->bo); diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index 53f06ac..cbd846a 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -1483,6 +1483,24 @@ calculate_gs_ring_sizes(struct radv_pipeline *pipeline) pipeline->graphics.gsvs_ring_size = MIN2(gsvs_ring_size, max_size); } +static const struct radv_prim_vertex_count prim_size_table[] = { + [V_008958_DI_PT_NONE] = {0, 0}, + [V_008958_DI_PT_POINTLIST] = {1, 1}, + [V_008958_DI_PT_LINELIST] = {2, 2}, + [V_008958_DI_PT_LINESTRIP] = {2, 1}, + [V_008958_DI_PT_TRILIST] = {3, 3}, + [V_008958_DI_PT_TRIFAN] = {3, 1}, + [V_008958_DI_PT_TRISTRIP] = {3, 1}, + [V_008958_DI_PT_LINELIST_ADJ] = {4, 4}, + [V_008958_DI_PT_LINESTRIP_ADJ] = {4, 1}, + [V_008958_DI_PT_TRILIST_ADJ] = {6, 6}, + [V_008958_DI_PT_TRISTRIP_ADJ] = {6, 2}, + [V_008958_DI_PT_RECTLIST] = {3, 3}, + [V_008958_DI_PT_LINELOOP] = {2, 1}, + [V_008958_DI_PT_POLYGON] = {3, 1}, + [V_008958_DI_PT_2D_TRI_STRIP] = {0, 0}, +}; + VkResult radv_pipeline_init(struct radv_pipeline *pipeline, struct radv_device *device, @@ -1581,7 +1599,9 @@ radv_pipeline_init(struct radv_pipeline *pipeline, pipeline->graphics.gs_out = V_028A6C_OUTPRIM_TYPE_TRISTRIP; } pipeline->graphics.prim_restart_enable = !!pCreateInfo->pInputAssemblyState->primitiveRestartEnable; - + /* prim vertex count will need TESS changes */ + pipeline->graphics.prim_vertex_count = prim_size_table[pipeline->graphics.prim]; + const VkPipelineVertexInputStateCreateInfo *vi_info = pCreateInfo->pVertexInputState; for (uint32_t i = 0; i < vi_info->vertexAttributeDescriptionCount; i++) { diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index dcccd94e..7b1d8fb 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -686,8 +686,8 @@ struct radv_attachment_state { struct radv_cmd_state { uint32_t vb_dirty; - bool vertex_descriptors_dirty; radv_cmd_dirty_mask_t dirty; + bool vertex_descriptors_dirty; struct radv_pipeline * pipeline; struct radv_pipeline * emitted_pipeline; @@ -710,6 +710,7 @@ struct radv_cmd_state { float offset_scale; uint32_t descriptors_dirty; uint32_t trace_id; + uint32_t last_ia_multi_vgt_param; }; struct radv_cmd_pool { @@ -771,7 +772,8 @@ void si_write_viewport(struct radeon_winsys_cs *cs, int first_vp, int count, const VkViewport *viewports); void si_write_scissors(struct radeon_winsys_cs *cs, int first, int count, const VkRect2D *scissors); -uint32_t si_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer); +uint32_t si_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer, + bool instanced_or_indirect_draw, uint32_t draw_vertex_count); void si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer); void si_cp_dma_buffer_copy(struct radv_cmd_buffer *cmd_buffer, uint64_t src_va, uint64_t dest_va, @@ -925,6 +927,11 @@ struct radv_multisample_state { unsigned num_samples; }; +struct radv_prim_vertex_count { + uint8_t min; + uint8_t incr; +}; + struct radv_pipeline { struct radv_device * device; uint32_t dynamic_state_mask; @@ -956,6 +963,7 @@ struct radv_pipeline { bool prim_restart_enable; unsigned esgs_ring_size; unsigned gsvs_ring_size; + struct radv_prim_vertex_count prim_vertex_count; } graphics; }; diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c index 7d81c2b..e20e3bd 100644 --- a/src/amd/vulkan/si_cmd_buffer.c +++ b/src/amd/vulkan/si_cmd_buffer.c @@ -565,8 +565,25 @@ si_write_scissors(struct radeon_winsys_cs *cs, int first, } } +static inline unsigned +radv_prims_for_vertices(struct radv_prim_vertex_count *info, unsigned num) +{ + if (num == 0) + return 0; + + if (info->incr == 0) + return 0; + + if (num < info->min) + return 0; + + return 1 + ((num - info->min) / info->incr); +} + uint32_t -si_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer) +si_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer, + bool instanced_or_indirect_draw, + uint32_t draw_vertex_count) { enum chip_class chip_class = cmd_buffer->device->physical_device->rad_info.chip_class; enum radeon_family family = cmd_buffer->device->physical_device->rad_info.family; @@ -580,10 +597,14 @@ si_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer) bool ia_switch_on_eoi = false; bool partial_vs_wave = false; bool partial_es_wave = false; + uint32_t num_prims = radv_prims_for_vertices(&cmd_buffer->state.pipeline->graphics.prim_vertex_count, draw_vertex_count); + bool multi_instances_smaller_than_primgroup; if (radv_pipeline_has_gs(cmd_buffer->state.pipeline)) primgroup_size = 64; /* recommended with a GS */ + multi_instances_smaller_than_primgroup = (instanced_or_indirect_draw || + num_prims < primgroup_size); /* TODO TES */ /* TODO linestipple */ @@ -596,12 +617,30 @@ si_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer) prim == V_008958_DI_PT_POLYGON || prim == V_008958_DI_PT_LINELOOP || prim == V_008958_DI_PT_TRIFAN || - prim == V_008958_DI_PT_TRISTRIP_ADJ) - // info->primitive_restart || - // info->count_from_stream_output) + prim == V_008958_DI_PT_TRISTRIP_ADJ || + (cmd_buffer->state.pipeline->graphics.prim_restart_enable && + (family < CHIP_POLARIS10 || + (prim != V_008958_DI_PT_POINTLIST && + prim != V_008958_DI_PT_LINESTRIP && + prim != V_008958_DI_PT_TRISTRIP)))) wd_switch_on_eop = true; - /* TODO HAWAII */ + /* Hawaii hangs if instancing is enabled and WD_SWITCH_ON_EOP is 0. + * We don't know that for indirect drawing, so treat it as + * always problematic. */ + if (family == CHIP_HAWAII && + instanced_or_indirect_draw) + wd_switch_on_eop = true; + + /* Performance recommendation for 4 SE Gfx7-8 parts if + * instances are smaller than a primgroup. + * Assume indirect draws always use small instances. + * This is needed for good VS wave utilization. + */ + if (chip_class <= VI && + info->max_se == 4 && + multi_instances_smaller_than_primgroup) + wd_switch_on_eop = true; /* Required on CIK and later. */ if (info->max_se > 2 && !wd_switch_on_eop) @@ -614,12 +653,11 @@ si_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer) (radv_pipeline_has_gs(cmd_buffer->state.pipeline) || max_primgroup_in_wave != 2)))) partial_vs_wave = true; -#if 0 /* Instancing bug on Bonaire. */ if (family == CHIP_BONAIRE && ia_switch_on_eoi && - (info->indirect || info->instance_count > 1)) + instanced_or_indirect_draw) partial_vs_wave = true; -#endif + /* If the WD switch is false, the IA switch must be false too. */ assert(wd_switch_on_eop || !ia_switch_on_eop); } @@ -627,19 +665,19 @@ si_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer) if (ia_switch_on_eoi) partial_es_wave = true; - /* GS requirement. */ - if (SI_GS_PER_ES / primgroup_size >= cmd_buffer->device->gs_table_depth - 3) - partial_es_wave = true; + if (radv_pipeline_has_gs(cmd_buffer->state.pipeline)) { + /* GS requirement. */ + if (SI_GS_PER_ES / primgroup_size >= cmd_buffer->device->gs_table_depth - 3) + partial_es_wave = true; + + /* Hw bug with single-primitive instances and SWITCH_ON_EOI + * on multi-SE chips. */ + if (info->max_se >= 2 && ia_switch_on_eoi && + (instanced_or_indirect_draw && + num_prims <= 1)) + cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_FLUSH; + } - /* Hw bug with single-primitive instances and SWITCH_ON_EOI - * on multi-SE chips. */ -#if 0 - if (info->max_se >= 2 && ia_switch_on_eoi && - (info->indirect || - (info->instance_count > 1 && - si_num_prims_for_vertices(info) <= 1))) - sctx->b.flags |= SI_CONTEXT_VGT_FLUSH; -#endif return S_028AA8_SWITCH_ON_EOP(ia_switch_on_eop) | S_028AA8_SWITCH_ON_EOI(ia_switch_on_eoi) | S_028AA8_PARTIAL_VS_WAVE_ON(partial_vs_wave) | -- 2.7.4