radv: Support NGG culling with new perftest environment variable.

author Timur Kristóf <timur.kristof@gmail.com>

Tue, 8 Jun 2021 10:32:35 +0000 (12:32 +0200)

committer Marge Bot <eric+marge@anholt.net>

Tue, 13 Jul 2021 23:56:33 +0000 (23:56 +0000)
author Timur Kristóf <timur.kristof@gmail.com>
Tue, 8 Jun 2021 10:32:35 +0000 (12:32 +0200)
committer Marge Bot <eric+marge@anholt.net>
Tue, 13 Jul 2021 23:56:33 +0000 (23:56 +0000)
diff --git a/docs/envvars.rst b/docs/envvars.rst

index 1538f0a..df9c043 100644 (file)
--- a/docs/envvars.rst
+++ b/docs/envvars.rst
@@ -647,6 +647,8 @@ RADV driver environment variables
        disable optimizations that get enabled when all VRAM is CPU visible.
     ``pswave32``
        enable wave32 for pixel shaders (GFX10+)
+   ``nggc``
+      enable NGG culling on GFX10+ GPUs.
     ``rt``
        enable rt extensions whose implementation is still experimental.
     ``sam``
diff --git a/docs/relnotes/new_features.txt b/docs/relnotes/new_features.txt

index 495a354..f00cbf6 100644 (file)
--- a/docs/relnotes/new_features.txt
+++ b/docs/relnotes/new_features.txt
@@ -15,6 +15,7 @@ VK_EXT_multi_draw on ANV, lavapipe, and RADV
  VK_KHR_separate_depth_stencil_layouts on lavapipe
  VK_EXT_separate_stencil_usage on lavapipe
  VK_EXT_extended_dynamic_state2 on lavapipe
+NGG shader based primitive culling is now supported by RADV.
  Panfrost supports OpenGL ES 3.1
  New Asahi driver for the Apple M1
  GL_ARB_sample_locations on zink
diff --git a/src/amd/vulkan/radv_debug.h b/src/amd/vulkan/radv_debug.h

index 88e8c53..5a0f295 100644 (file)
--- a/src/amd/vulkan/radv_debug.h
+++ b/src/amd/vulkan/radv_debug.h
@@ -74,6 +74,7 @@ enum {
     RADV_PERFTEST_NO_SAM = 1u << 6,
     RADV_PERFTEST_SAM = 1u << 7,
     RADV_PERFTEST_RT = 1u << 8,
+   RADV_PERFTEST_NGGC = 1u << 9,
  };
  
  bool radv_init_trace(struct radv_device *device);
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c

index 1ae5a2e..738f68d 100644 (file)
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -830,6 +830,7 @@ static const struct debug_control radv_perftest_options[] = {{"localbos", RADV_P
                                                               {"nosam", RADV_PERFTEST_NO_SAM},
                                                               {"sam", RADV_PERFTEST_SAM},
                                                               {"rt", RADV_PERFTEST_RT},
+                                                             {"nggc", RADV_PERFTEST_NGGC},
                                                               {NULL, 0}};
  
  const char *
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c

index 9b08c1b..3dfaa44 100644 (file)
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -211,6 +211,8 @@ get_hash_flags(const struct radv_device *device, bool stats)
  
     if (device->instance->debug_flags & RADV_DEBUG_NO_NGG)
        hash_flags |= RADV_HASH_SHADER_NO_NGG;
+   if (device->instance->perftest_flags & RADV_PERFTEST_NGGC)
+      hash_flags |= RADV_HASH_SHADER_FORCE_NGG_CULLING;
     if (device->physical_device->cs_wave_size == 32)
        hash_flags |= RADV_HASH_SHADER_CS_WAVE32;
     if (device->physical_device->ps_wave_size == 32)
@@ -3451,8 +3453,11 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device,
           bool io_to_mem = radv_lower_io_to_mem(device, nir[i], &infos[i], pipeline_key);
           bool lowered_ngg = pipeline_has_ngg && i == pipeline->graphics.last_vgt_api_stage &&
                              !radv_use_llvm_for_stage(device, i);
-         if (lowered_ngg)
-            radv_lower_ngg(device, nir[i], &infos[i], pipeline_key, &keys[i]);
+         if (lowered_ngg) {
+            uint64_t ps_inputs_read = nir[MESA_SHADER_FRAGMENT] ? nir[MESA_SHADER_FRAGMENT]->info.inputs_read : 0;
+            bool consider_culling = radv_consider_culling(device, nir[i], ps_inputs_read);
+            radv_lower_ngg(device, nir[i], &infos[i], pipeline_key, &keys[i], consider_culling);
+         }
  
           radv_optimize_nir_algebraic(nir[i], io_to_mem || lowered_ngg || i == MESA_SHADER_COMPUTE);
  
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h

index b99bea0..045af48 100644 (file)
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -1672,6 +1672,7 @@ struct radv_event {
  #define RADV_HASH_SHADER_FORCE_VRS_2x2     (1 << 9)
  #define RADV_HASH_SHADER_FORCE_VRS_2x1     (1 << 10)
  #define RADV_HASH_SHADER_FORCE_VRS_1x2     (1 << 11)
+#define RADV_HASH_SHADER_FORCE_NGG_CULLING (1 << 13)
  
  void radv_hash_shaders(unsigned char *hash, const VkPipelineShaderStageCreateInfo **stages,
                         const struct radv_pipeline_layout *layout,
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c

index 5a59e7f..96bdb2c 100644 (file)
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -907,10 +907,44 @@ radv_lower_io_to_mem(struct radv_device *device, struct nir_shader *nir,
     return false;
  }
  
+bool
+radv_consider_culling(struct radv_device *device, struct nir_shader *nir,
+                      uint64_t ps_inputs_read)
+{
+   /* Culling doesn't make sense for meta shaders. */
+   if (!!nir->info.name)
+      return false;
+
+   /* TODO: enable by default on GFX10.3 when we're confident about performance. */
+   bool culling_enabled = device->instance->perftest_flags & RADV_PERFTEST_NGGC;
+
+   if (!culling_enabled)
+      return false;
+
+   /* Shader based culling efficiency can depend on PS throughput.
+    * Estimate an upper limit for PS input param count based on GPU info.
+    */
+   unsigned max_ps_params;
+   unsigned max_render_backends = device->physical_device->rad_info.max_render_backends;
+   unsigned max_se = device->physical_device->rad_info.max_se;
+
+   if (max_render_backends < 2)
+      return false; /* Don't use NGG culling on 1 RB chips. */
+   else if (max_render_backends / max_se == 4)
+      max_ps_params = 6; /* Sienna Cichlid and other GFX10.3 dGPUs. */
+   else
+      max_ps_params = 4; /* Navi 1x. */
+
+   /* TODO: consider other heuristics here, such as PS execution time */
+
+   return util_bitcount64(ps_inputs_read & ~VARYING_BIT_POS) <= max_ps_params;
+}
+
  void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
                      struct radv_shader_info *info,
                      const struct radv_pipeline_key *pl_key,
-                    struct radv_shader_variant_key *key)
+                    struct radv_shader_variant_key *key,
+                    bool consider_culling)
  {
     /* TODO: support the LLVM backend with the NIR lowering */
     assert(!radv_use_llvm_for_stage(device, nir->info.stage));
@@ -930,9 +964,19 @@ void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
           num_vertices_per_prim = 1;
        else if (nir->info.tess.primitive_mode == GL_ISOLINES)
           num_vertices_per_prim = 2;
+
+      /* Manually mark the primitive ID used, so the shader can repack it. */
+      if (key->vs_common_out.export_prim_id)
+         BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
+
     } else if (nir->info.stage == MESA_SHADER_VERTEX) {
        /* Need to add 1, because: V_028A6C_POINTLIST=0, V_028A6C_LINESTRIP=1, V_028A6C_TRISTRIP=2, etc. */
        num_vertices_per_prim = key->vs.outprim + 1;
+
+      /* Manually mark the instance ID used, so the shader can repack it. */
+      if (key->vs.instance_rate_inputs)
+         BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);
+
     } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
        num_vertices_per_prim = nir->info.gs.vertices_in;
     } else {
@@ -964,7 +1008,7 @@ void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
              num_vertices_per_prim,
              max_workgroup_size,
              info->wave_size,
-            false,
+            consider_culling,
              key->vs_common_out.as_ngg_passthrough,
              key->vs_common_out.export_prim_id,
              key->vs.provoking_vtx_last);
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h

index 1ad54b9..ab3dcac 100644 (file)
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -569,6 +569,10 @@ bool radv_lower_io_to_mem(struct radv_device *device, struct nir_shader *nir,
  void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir,
                      struct radv_shader_info *info,
                      const struct radv_pipeline_key *pl_key,
-                    struct radv_shader_variant_key *key);
+                    struct radv_shader_variant_key *key,
+                    bool consider_culling);
+
+bool radv_consider_culling(struct radv_device *device, struct nir_shader *nir,
+                           uint64_t ps_inputs_read);
  
  #endif
author	Timur Kristóf <timur.kristof@gmail.com>
	Tue, 8 Jun 2021 10:32:35 +0000 (12:32 +0200)
committer	Marge Bot <eric+marge@anholt.net>
	Tue, 13 Jul 2021 23:56:33 +0000 (23:56 +0000)
docs/envvars.rst		patch \| blob \| history
docs/relnotes/new_features.txt		patch \| blob \| history
src/amd/vulkan/radv_debug.h		patch \| blob \| history
src/amd/vulkan/radv_device.c		patch \| blob \| history
src/amd/vulkan/radv_pipeline.c		patch \| blob \| history
src/amd/vulkan/radv_private.h		patch \| blob \| history
src/amd/vulkan/radv_shader.c		patch \| blob \| history
src/amd/vulkan/radv_shader.h		patch \| blob \| history