From: Danylo Piliaiev Date: Fri, 18 Feb 2022 17:15:03 +0000 (+0200) Subject: turnip: Implement VK_ARM_rasterization_order_attachment_access X-Git-Tag: upstream/22.3.5~12371 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=ebc23ac963db2710553118de5b86caa7681ba774;p=platform%2Fupstream%2Fmesa.git turnip: Implement VK_ARM_rasterization_order_attachment_access Trivially implemented by using A6XX_GRAS_SC_CNTL_SINGLE_PRIM_MODE. This extension is useful for emulators e.g. AetherSX2 PS2 emulator and could drastically improve performance when blending is emulated. Relevant tests: dEQP-VK.rasterization.rasterization_order_attachment_access.* Signed-off-by: Danylo Piliaiev Part-of: --- diff --git a/docs/features.txt b/docs/features.txt index de5a1aa..cf48003 100644 --- a/docs/features.txt +++ b/docs/features.txt @@ -598,6 +598,7 @@ Khronos extensions that are not part of any Vulkan version: VK_AMD_shader_info DONE (radv) VK_AMD_shader_trinary_minmax DONE (radv) VK_AMD_texture_gather_bias_lod DONE (radv) + VK_ARM_rasterization_order_attachment_access DONE (tu) OpenCL 1.0 -- all DONE: diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c index 62149e6..f6dc6ea 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.c +++ b/src/freedreno/vulkan/tu_cmd_buffer.c @@ -304,26 +304,6 @@ tu6_emit_mrt(struct tu_cmd_buffer *cmd, unsigned layers = MAX2(fb->layers, util_logbase2(subpass->multiview_mask) + 1); tu_cs_emit_regs(cs, A6XX_GRAS_MAX_LAYER_INDEX(layers - 1)); - - tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_SC_CNTL, - A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2)); - - /* If there is a feedback loop, then the shader can read the previous value - * of a pixel being written out. It can also write some components and then - * read different components without a barrier in between. This is a - * problem in sysmem mode with UBWC, because the main buffer and flags - * buffer can get out-of-sync if only one is flushed. 
We fix this by - * setting the SINGLE_PRIM_MODE field to the same value that the blob does - * for advanced_blend in sysmem mode if a feedback loop is detected. - */ - if (subpass->feedback_loop_color || subpass->feedback_loop_ds) { - tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM); - tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_SC_CNTL, - A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2) | - A6XX_GRAS_SC_CNTL_SINGLE_PRIM_MODE( - FLUSH_PER_OVERLAP_AND_OVERWRITE)); - tu_cond_exec_end(cs); - } } void @@ -535,9 +515,11 @@ tu_cs_emit_draw_state(struct tu_cs *cs, uint32_t id, struct tu_draw_state state) enable_mask = CP_SET_DRAW_STATE__0_BINNING; break; case TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM: + case TU_DRAW_STATE_PRIM_MODE_GMEM: enable_mask = CP_SET_DRAW_STATE__0_GMEM; break; case TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM: + case TU_DRAW_STATE_PRIM_MODE_SYSMEM: enable_mask = CP_SET_DRAW_STATE__0_SYSMEM; break; default: @@ -2329,7 +2311,7 @@ tu_CmdBindPipeline(VkCommandBuffer commandBuffer, if (!(cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)) { uint32_t mask = ~pipeline->dynamic_state_mask & BITFIELD_MASK(TU_DYNAMIC_STATE_COUNT); - tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (7 + util_bitcount(mask))); + tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (9 + util_bitcount(mask))); tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_CONFIG, pipeline->program.config_state); tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM, pipeline->program.state); tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_BINNING, pipeline->program.binning_state); @@ -2337,6 +2319,8 @@ tu_CmdBindPipeline(VkCommandBuffer commandBuffer, tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VI_BINNING, pipeline->vi.binning_state); tu_cs_emit_draw_state(cs, TU_DRAW_STATE_RAST, pipeline->rast_state); tu_cs_emit_draw_state(cs, TU_DRAW_STATE_BLEND, pipeline->blend_state); + tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_SYSMEM, pipeline->prim_order_state_sysmem); + tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_GMEM, 
pipeline->prim_order_state_gmem); u_foreach_bit(i, mask) tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DYNAMIC + i, pipeline->dynamic_state[i]); @@ -4000,6 +3984,8 @@ tu6_draw_common(struct tu_cmd_buffer *cmd, tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VI_BINNING, pipeline->vi.binning_state); tu_cs_emit_draw_state(cs, TU_DRAW_STATE_RAST, pipeline->rast_state); tu_cs_emit_draw_state(cs, TU_DRAW_STATE_BLEND, pipeline->blend_state); + tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_SYSMEM, pipeline->prim_order_state_sysmem); + tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PRIM_MODE_GMEM, pipeline->prim_order_state_gmem); tu_cs_emit_draw_state(cs, TU_DRAW_STATE_SHADER_GEOM_CONST, cmd->state.shader_const[0]); tu_cs_emit_draw_state(cs, TU_DRAW_STATE_FS_CONST, cmd->state.shader_const[1]); tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DESC_SETS, cmd->state.desc_sets); diff --git a/src/freedreno/vulkan/tu_device.c b/src/freedreno/vulkan/tu_device.c index 6caee82..49f6235 100644 --- a/src/freedreno/vulkan/tu_device.c +++ b/src/freedreno/vulkan/tu_device.c @@ -205,6 +205,7 @@ get_device_extensions(const struct tu_physical_device *device, .EXT_image_robustness = true, /* For Graphics Flight Recorder (GFR) */ .AMD_buffer_marker = true, + .ARM_rasterization_order_attachment_access = true, #ifdef ANDROID .ANDROID_native_buffer = true, #endif @@ -810,6 +811,14 @@ tu_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice, features->primitiveTopologyPatchListRestart = false; break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RASTERIZATION_ORDER_ATTACHMENT_ACCESS_FEATURES_ARM: { + VkPhysicalDeviceRasterizationOrderAttachmentAccessFeaturesARM *features = + (VkPhysicalDeviceRasterizationOrderAttachmentAccessFeaturesARM *)ext; + features->rasterizationOrderColorAttachmentAccess = true; + features->rasterizationOrderDepthAttachmentAccess = true; + features->rasterizationOrderStencilAttachmentAccess = true; + break; + } default: break; diff --git a/src/freedreno/vulkan/tu_pass.c 
b/src/freedreno/vulkan/tu_pass.c index 01c05fa..3b892be 100644 --- a/src/freedreno/vulkan/tu_pass.c +++ b/src/freedreno/vulkan/tu_pass.c @@ -750,6 +750,13 @@ tu_CreateRenderPass2(VkDevice _device, subpass->samples = 0; subpass->srgb_cntl = 0; + const VkSubpassDescriptionFlagBits raster_order_access_bits = + VK_SUBPASS_DESCRIPTION_RASTERIZATION_ORDER_ATTACHMENT_COLOR_ACCESS_BIT_ARM | + VK_SUBPASS_DESCRIPTION_RASTERIZATION_ORDER_ATTACHMENT_DEPTH_ACCESS_BIT_ARM | + VK_SUBPASS_DESCRIPTION_RASTERIZATION_ORDER_ATTACHMENT_STENCIL_ACCESS_BIT_ARM; + + subpass->raster_order_attachment_access = desc->flags & raster_order_access_bits; + subpass->multiview_mask = desc->viewMask; if (desc->inputAttachmentCount > 0) { diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c index 78cb64f..49332f3 100644 --- a/src/freedreno/vulkan/tu_pipeline.c +++ b/src/freedreno/vulkan/tu_pipeline.c @@ -274,6 +274,8 @@ struct tu_pipeline_builder uint32_t render_components; uint32_t multiview_mask; + bool subpass_raster_order_attachment_access; + bool subpass_feedback_loop_color; bool subpass_feedback_loop_ds; }; @@ -3152,6 +3154,78 @@ tu_pipeline_builder_parse_multisample_and_color_blend( } static void +tu_pipeline_builder_parse_rasterization_order( + struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline) +{ + if (builder->rasterizer_discard) + return; + + pipeline->subpass_feedback_loop_ds = builder->subpass_feedback_loop_ds; + + const VkPipelineColorBlendStateCreateInfo *blend_info = + builder->create_info->pColorBlendState; + + const VkPipelineDepthStencilStateCreateInfo *ds_info = + builder->create_info->pDepthStencilState; + + if (builder->use_color_attachments) { + pipeline->raster_order_attachment_access = + blend_info->flags & + VK_PIPELINE_COLOR_BLEND_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_ACCESS_BIT_ARM; + } + + if (builder->depth_attachment_format != VK_FORMAT_UNDEFINED) { + pipeline->raster_order_attachment_access |= + ds_info->flags & + 
(VK_PIPELINE_DEPTH_STENCIL_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_DEPTH_ACCESS_BIT_ARM | + VK_PIPELINE_DEPTH_STENCIL_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_STENCIL_ACCESS_BIT_ARM); + } + + /* VK_EXT_blend_operation_advanced would also require ordered access + * when implemented in the future. + */ + + uint32_t sysmem_prim_mode = NO_FLUSH; + uint32_t gmem_prim_mode = NO_FLUSH; + + if (pipeline->raster_order_attachment_access) { + /* VK_ARM_rasterization_order_attachment_access: + * + * This extension allows access to framebuffer attachments when used as + * both input and color attachments from one fragment to the next, + * in rasterization order, without explicit synchronization. + */ + sysmem_prim_mode = FLUSH_PER_OVERLAP_AND_OVERWRITE; + gmem_prim_mode = FLUSH_PER_OVERLAP; + } else { + /* If there is a feedback loop, then the shader can read the previous value + * of a pixel being written out. It can also write some components and then + * read different components without a barrier in between. This is a + * problem in sysmem mode with UBWC, because the main buffer and flags + * buffer can get out-of-sync if only one is flushed. We fix this by + * setting the SINGLE_PRIM_MODE field to the same value that the blob does + * for advanced_blend in sysmem mode if a feedback loop is detected. 
+ */ + if (builder->subpass_feedback_loop_color || + builder->subpass_feedback_loop_ds) { + sysmem_prim_mode = FLUSH_PER_OVERLAP_AND_OVERWRITE; + } + } + + struct tu_cs cs; + + pipeline->prim_order_state_gmem = tu_cs_draw_state(&pipeline->cs, &cs, 2); + tu_cs_emit_write_reg(&cs, REG_A6XX_GRAS_SC_CNTL, + A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2) | + A6XX_GRAS_SC_CNTL_SINGLE_PRIM_MODE(gmem_prim_mode)); + + pipeline->prim_order_state_sysmem = tu_cs_draw_state(&pipeline->cs, &cs, 2); + tu_cs_emit_write_reg(&cs, REG_A6XX_GRAS_SC_CNTL, + A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2) | + A6XX_GRAS_SC_CNTL_SINGLE_PRIM_MODE(sysmem_prim_mode)); +} + +static void tu_pipeline_finish(struct tu_pipeline *pipeline, struct tu_device *dev, const VkAllocationCallbacks *alloc) @@ -3176,7 +3250,6 @@ tu_pipeline_builder_build(struct tu_pipeline_builder *builder, return VK_ERROR_OUT_OF_HOST_MEMORY; (*pipeline)->layout = builder->layout; - (*pipeline)->subpass_feedback_loop_ds = builder->subpass_feedback_loop_ds; (*pipeline)->executables_mem_ctx = ralloc_context(NULL); util_dynarray_init(&(*pipeline)->executables, (*pipeline)->executables_mem_ctx); @@ -3236,6 +3309,7 @@ tu_pipeline_builder_build(struct tu_pipeline_builder *builder, tu_pipeline_builder_parse_rasterization(builder, *pipeline); tu_pipeline_builder_parse_depth_stencil(builder, *pipeline); tu_pipeline_builder_parse_multisample_and_color_blend(builder, *pipeline); + tu_pipeline_builder_parse_rasterization_order(builder, *pipeline); tu6_emit_load_state(*pipeline, false); /* we should have reserved enough space upfront such that the CS never @@ -3290,6 +3364,9 @@ tu_pipeline_builder_init_graphics( const struct tu_subpass *subpass = &pass->subpasses[create_info->subpass]; + builder->subpass_raster_order_attachment_access = + subpass->raster_order_attachment_access; + builder->subpass_feedback_loop_color = subpass->feedback_loop_color; builder->subpass_feedback_loop_ds = subpass->feedback_loop_ds; builder->multiview_mask = 
subpass->multiview_mask; diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h index b9ec471..b3e7d48 100644 --- a/src/freedreno/vulkan/tu_private.h +++ b/src/freedreno/vulkan/tu_private.h @@ -616,6 +616,8 @@ enum tu_draw_state_group_id TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM, TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM, TU_DRAW_STATE_LRZ_AND_DEPTH_PLANE, + TU_DRAW_STATE_PRIM_MODE_GMEM, + TU_DRAW_STATE_PRIM_MODE_SYSMEM, /* dynamic state related draw states */ TU_DRAW_STATE_DYNAMIC, @@ -1317,6 +1319,7 @@ struct tu_pipeline /* draw states for the pipeline */ struct tu_draw_state load_state, rast_state, blend_state; + struct tu_draw_state prim_order_state_sysmem, prim_order_state_gmem; /* for vertex buffers state */ uint32_t num_vbs; @@ -1359,6 +1362,8 @@ struct tu_pipeline struct tu_lrz_pipeline lrz; + /* In other words - framebuffer fetch support */ + bool raster_order_attachment_access; bool subpass_feedback_loop_ds; /* Base drawcall cost for sysmem vs gmem autotuner */ @@ -1701,6 +1706,9 @@ struct tu_subpass /* True if we must invalidate UCHE thanks to a feedback loop. */ bool feedback_invalidate; + /* In other words - framebuffer fetch support */ + bool raster_order_attachment_access; + struct tu_subpass_attachment *input_attachments; struct tu_subpass_attachment *color_attachments; struct tu_subpass_attachment *resolve_attachments;