radv: enable NGG culling unconditionally for GPL but disable it dynamically
author: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Mon, 19 Sep 2022 16:51:20 +0000 (18:51 +0200)
committer: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Mon, 26 Sep 2022 07:28:14 +0000 (09:28 +0200)
When compiling the pre-rasterization stages we don't know the primitive
topology, but we still want to enable NGG culling for performance. To
achieve that, NGG culling is enabled unconditionally when the topology
is unknown and disabled dynamically for points or lines.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18776>

src/amd/vulkan/radv_cmd_buffer.c
src/amd/vulkan/radv_private.h
src/amd/vulkan/radv_shader.c
src/amd/vulkan/radv_shader_info.c

index e4e1409..a2106df 100644 (file)
@@ -7063,16 +7063,33 @@ radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer,
 }
 
 ALWAYS_INLINE static bool
-radv_skip_ngg_culling(bool has_tess, const unsigned vtx_cnt,
-                      bool indirect)
+radv_skip_ngg_culling(struct radv_cmd_buffer *cmd_buffer,
+                      const struct radv_graphics_pipeline *pipeline,
+                      const struct radv_draw_info *draw_info)
 {
+   const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
+
    /* If we have to draw only a few vertices, we get better latency if
     * we disable NGG culling.
     *
     * When tessellation is used, what matters is the number of tessellated
     * vertices, so let's always assume it's not a small draw.
     */
-   return !has_tess && !indirect && vtx_cnt < 128;
+   if (pipeline->last_vgt_api_stage != MESA_SHADER_VERTEX)
+      return false;
+
+   if (!draw_info->indirect && draw_info->count < 128)
+      return true;
+
+   /* With graphics pipeline library, NGG culling is enabled unconditionally because we don't know
+    * the primitive topology at compile time, but we should still disable it dynamically for points
+    * or lines.
+    */
+   unsigned num_vertices_per_prim = si_conv_prim_to_gs_out(d->primitive_topology) + 1;
+   if (num_vertices_per_prim != 3)
+      return true;
+
+   return false;
 }
 
 ALWAYS_INLINE static uint32_t
@@ -7148,8 +7165,7 @@ radv_emit_ngg_culling_state(struct radv_cmd_buffer *cmd_buffer, const struct rad
    /* Check small draw status:
     * For small draw calls, we disable culling by setting the SGPR to 0.
     */
-   const bool skip =
-      radv_skip_ngg_culling(stage == MESA_SHADER_TESS_EVAL, draw_info->count, draw_info->indirect);
+   const bool skip = radv_skip_ngg_culling(cmd_buffer, pipeline, draw_info);
 
    /* See if anything changed. */
    if (!dirty && skip == cmd_buffer->state.last_nggc_skip)
index 4e05c42..ca0cd06 100644 (file)
@@ -2958,6 +2958,22 @@ radv_rast_prim_is_points_or_lines(unsigned rast_prim)
    return radv_rast_prim_is_point(rast_prim) || radv_rast_prim_is_line(rast_prim);
 }
 
+static inline unsigned
+radv_get_num_vertices_per_prim(const struct radv_pipeline_key *pipeline_key)
+{
+   if (pipeline_key->vs.topology == V_008958_DI_PT_NONE) {
+      /* When the topology is unknown (with graphics pipeline library), return the maximum number of
+       * vertices per primitive for VS. This is used to lower NGG (the HW will ignore the extra
+       * bits for points/lines) and also to enable NGG culling unconditionally (it will be disabled
+       * dynamically for points/lines).
+       */
+      return 3;
+   } else {
+      /* Need to add 1, because: V_028A6C_POINTLIST=0, V_028A6C_LINESTRIP=1, V_028A6C_TRISTRIP=2, etc. */
+      return si_conv_prim_to_gs_out(pipeline_key->vs.topology) + 1;
+   }
+}
+
 static inline uint32_t
 si_translate_stencil_op(enum VkStencilOp op)
 {
index 0a1e55e..58f5d1e 100644 (file)
@@ -1347,16 +1347,7 @@ void radv_lower_ngg(struct radv_device *device, struct radv_pipeline_stage *ngg_
          BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
 
    } else if (nir->info.stage == MESA_SHADER_VERTEX) {
-      if (pl_key->vs.topology == V_008958_DI_PT_NONE) {
-         /* When the topology is unknown (with graphics pipeline library), use the maximum number of
-          * vertices per primitives for simplicity, the HW will ignore the extra bits if points or
-          * lines are used anyways.
-          */
-         num_vertices_per_prim = 3;
-      } else {
-         /* Need to add 1, because: V_028A6C_POINTLIST=0, V_028A6C_LINESTRIP=1, V_028A6C_TRISTRIP=2, etc. */
-         num_vertices_per_prim = si_conv_prim_to_gs_out(pl_key->vs.topology) + 1;
-      }
+      num_vertices_per_prim = radv_get_num_vertices_per_prim(pl_key);
 
       /* Manually mark the instance ID used, so the shader can repack it. */
       if (pl_key->vs.instance_rate_inputs)
index 3ddcdaa..2b6463f 100644 (file)
@@ -1207,8 +1207,10 @@ radv_determine_ngg_settings(struct radv_device *device, struct radv_pipeline_sta
 
    uint64_t ps_inputs_read = fs_stage->nir->info.inputs_read;
 
-   unsigned num_vertices_per_prim = si_conv_prim_to_gs_out(pipeline_key->vs.topology) + 1;
-   if (es_stage->stage == MESA_SHADER_TESS_EVAL) {
+   unsigned num_vertices_per_prim = 0;
+   if (es_stage->stage == MESA_SHADER_VERTEX) {
+      num_vertices_per_prim = radv_get_num_vertices_per_prim(pipeline_key);
+   } else if (es_stage->stage == MESA_SHADER_TESS_EVAL) {
       num_vertices_per_prim = es_stage->nir->info.tess.point_mode ? 1 :
          es_stage->nir->info.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES ? 2 : 3;
    }