From f30e4351de2f562955435a04cf75dd641639d31c Mon Sep 17 00:00:00 2001 From: =?utf8?q?Timur=20Krist=C3=B3f?= Date: Tue, 8 Jun 2021 12:32:35 +0200 Subject: [PATCH] radv: Support NGG culling with new perftest environment variable. MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Currently we don't enable it on any chip by default, but we plan to enable it soon on GFX10.3 when we are comfortable with its performance. RADV_PERFTEST=nggc environment variable enables it on GFX10+ GPUs. Signed-off-by: Timur Kristóf Reviewed-by: Samuel Pitoiset Part-of: --- docs/envvars.rst | 2 ++ docs/relnotes/new_features.txt | 1 + src/amd/vulkan/radv_debug.h | 1 + src/amd/vulkan/radv_device.c | 1 + src/amd/vulkan/radv_pipeline.c | 9 ++++++-- src/amd/vulkan/radv_private.h | 1 + src/amd/vulkan/radv_shader.c | 48 ++++++++++++++++++++++++++++++++++++++++-- src/amd/vulkan/radv_shader.h | 6 +++++- 8 files changed, 64 insertions(+), 5 deletions(-) diff --git a/docs/envvars.rst b/docs/envvars.rst index 1538f0a..df9c043 100644 --- a/docs/envvars.rst +++ b/docs/envvars.rst @@ -647,6 +647,8 @@ RADV driver environment variables disable optimizations that get enabled when all VRAM is CPU visible. ``pswave32`` enable wave32 for pixel shaders (GFX10+) + ``nggc`` + enable NGG culling on GFX10+ GPUs. ``rt`` enable rt extensions whose implementation is still experimental. ``sam`` diff --git a/docs/relnotes/new_features.txt b/docs/relnotes/new_features.txt index 495a354..f00cbf6 100644 --- a/docs/relnotes/new_features.txt +++ b/docs/relnotes/new_features.txt @@ -15,6 +15,7 @@ VK_EXT_multi_draw on ANV, lavapipe, and RADV VK_KHR_separate_depth_stencil_layouts on lavapipe VK_EXT_separate_stencil_usage on lavapipe VK_EXT_extended_dynamic_state2 on lavapipe +NGG shader based primitive culling is now supported by RADV. Panfrost supports OpenGL ES 3.1 New Asahi driver for the Apple M1 GL_ARB_sample_locations on zink diff --git a/src/amd/vulkan/radv_debug.h b/src/amd/vulkan/radv_debug.h index 88e8c53..5a0f295 100644 --- a/src/amd/vulkan/radv_debug.h +++ b/src/amd/vulkan/radv_debug.h @@ -74,6 +74,7 @@ enum { RADV_PERFTEST_NO_SAM = 1u << 6, RADV_PERFTEST_SAM = 1u << 7, RADV_PERFTEST_RT = 1u << 8, + RADV_PERFTEST_NGGC = 1u << 9, }; bool radv_init_trace(struct radv_device *device); diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index 1ae5a2e..738f68d 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -830,6 +830,7 @@ static const struct debug_control radv_perftest_options[] = {{"localbos", RADV_P {"nosam", RADV_PERFTEST_NO_SAM}, {"sam", RADV_PERFTEST_SAM}, {"rt", RADV_PERFTEST_RT}, + {"nggc", RADV_PERFTEST_NGGC}, {NULL, 0}}; const char * diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index 9b08c1b..3dfaa44 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -211,6 +211,8 @@ get_hash_flags(const struct radv_device *device, bool stats) if (device->instance->debug_flags & RADV_DEBUG_NO_NGG) hash_flags |= RADV_HASH_SHADER_NO_NGG; + if (device->instance->perftest_flags & RADV_PERFTEST_NGGC) + hash_flags |= RADV_HASH_SHADER_FORCE_NGG_CULLING; if (device->physical_device->cs_wave_size == 32) hash_flags |= RADV_HASH_SHADER_CS_WAVE32; if (device->physical_device->ps_wave_size == 32) @@ -3451,8 +3453,11 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_device *device, bool io_to_mem = radv_lower_io_to_mem(device, nir[i], &infos[i], pipeline_key); bool lowered_ngg = pipeline_has_ngg && i == pipeline->graphics.last_vgt_api_stage && !radv_use_llvm_for_stage(device, i); - if (lowered_ngg) - radv_lower_ngg(device, nir[i], &infos[i], pipeline_key, &keys[i]); + if (lowered_ngg) { + uint64_t ps_inputs_read = nir[MESA_SHADER_FRAGMENT] ? nir[MESA_SHADER_FRAGMENT]->info.inputs_read : 0; + bool consider_culling = radv_consider_culling(device, nir[i], ps_inputs_read); + radv_lower_ngg(device, nir[i], &infos[i], pipeline_key, &keys[i], consider_culling); + } radv_optimize_nir_algebraic(nir[i], io_to_mem || lowered_ngg || i == MESA_SHADER_COMPUTE); diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index b99bea0..045af48 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -1672,6 +1672,7 @@ struct radv_event { #define RADV_HASH_SHADER_FORCE_VRS_2x2 (1 << 9) #define RADV_HASH_SHADER_FORCE_VRS_2x1 (1 << 10) #define RADV_HASH_SHADER_FORCE_VRS_1x2 (1 << 11) +#define RADV_HASH_SHADER_FORCE_NGG_CULLING (1 << 13) void radv_hash_shaders(unsigned char *hash, const VkPipelineShaderStageCreateInfo **stages, const struct radv_pipeline_layout *layout, diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index 5a59e7f..96bdb2c 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -907,10 +907,44 @@ radv_lower_io_to_mem(struct radv_device *device, struct nir_shader *nir, return false; } +bool +radv_consider_culling(struct radv_device *device, struct nir_shader *nir, + uint64_t ps_inputs_read) +{ + /* Culling doesn't make sense for meta shaders. */ + if (!!nir->info.name) + return false; + + /* TODO: enable by default on GFX10.3 when we're confident about performance. */ + bool culling_enabled = device->instance->perftest_flags & RADV_PERFTEST_NGGC; + + if (!culling_enabled) + return false; + + /* Shader based culling efficiency can depend on PS throughput. + * Estimate an upper limit for PS input param count based on GPU info. + */ + unsigned max_ps_params; + unsigned max_render_backends = device->physical_device->rad_info.max_render_backends; + unsigned max_se = device->physical_device->rad_info.max_se; + + if (max_render_backends < 2) + return false; /* Don't use NGG culling on 1 RB chips. */ + else if (max_render_backends / max_se == 4) + max_ps_params = 6; /* Sienna Cichlid and other GFX10.3 dGPUs. */ + else + max_ps_params = 4; /* Navi 1x. */ + + /* TODO: consider other heuristics here, such as PS execution time */ + + return util_bitcount64(ps_inputs_read & ~VARYING_BIT_POS) <= max_ps_params; +} + void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir, struct radv_shader_info *info, const struct radv_pipeline_key *pl_key, - struct radv_shader_variant_key *key) + struct radv_shader_variant_key *key, + bool consider_culling) { /* TODO: support the LLVM backend with the NIR lowering */ assert(!radv_use_llvm_for_stage(device, nir->info.stage)); @@ -930,9 +964,19 @@ void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir, num_vertices_per_prim = 1; else if (nir->info.tess.primitive_mode == GL_ISOLINES) num_vertices_per_prim = 2; + + /* Manually mark the primitive ID used, so the shader can repack it. */ + if (key->vs_common_out.export_prim_id) + BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID); + } else if (nir->info.stage == MESA_SHADER_VERTEX) { /* Need to add 1, because: V_028A6C_POINTLIST=0, V_028A6C_LINESTRIP=1, V_028A6C_TRISTRIP=2, etc. */ num_vertices_per_prim = key->vs.outprim + 1; + + /* Manually mark the instance ID used, so the shader can repack it. */ + if (key->vs.instance_rate_inputs) + BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID); + } else if (nir->info.stage == MESA_SHADER_GEOMETRY) { num_vertices_per_prim = nir->info.gs.vertices_in; } else { @@ -964,7 +1008,7 @@ void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir, num_vertices_per_prim, max_workgroup_size, info->wave_size, - false, + consider_culling, key->vs_common_out.as_ngg_passthrough, key->vs_common_out.export_prim_id, key->vs.provoking_vtx_last); diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h index 1ad54b9..ab3dcac 100644 --- a/src/amd/vulkan/radv_shader.h +++ b/src/amd/vulkan/radv_shader.h @@ -569,6 +569,10 @@ bool radv_lower_io_to_mem(struct radv_device *device, struct nir_shader *nir, void radv_lower_ngg(struct radv_device *device, struct nir_shader *nir, struct radv_shader_info *info, const struct radv_pipeline_key *pl_key, - struct radv_shader_variant_key *key); + struct radv_shader_variant_key *key, + bool consider_culling); + +bool radv_consider_culling(struct radv_device *device, struct nir_shader *nir, + uint64_t ps_inputs_read); #endif -- 2.7.4