radv: fix dynamic RT stack size with VGPR spilling
author Rhys Perry <pendingchaos02@gmail.com>
Mon, 14 Feb 2022 19:23:15 +0000 (19:23 +0000)
committer Marge Bot <emma+marge@anholt.net>
Tue, 20 Sep 2022 01:39:20 +0000 (01:39 +0000)
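VGPR spilling might cause VGPRs to be spilled at scratch offset 0, so we
can't use offset 0 as the base of the dynamically sized callable stack.
Instead, pass the stack base to the shader in a user SGPR, computed as
scratch_bytes_per_wave / wave_size.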

fossil-db (Sienna Cichlid, Q2RTX and Control):
Totals from 4 (0.26% of 1524) affected shaders:
Instrs: 8734 -> 8737 (+0.03%)
CodeSize: 48492 -> 48504 (+0.02%)
Latency: 384375 -> 384369 (-0.00%)
InvThroughput: 256250 -> 256246 (-0.00%)
Copies: 1312 -> 1313 (+0.08%)
Branches: 256 -> 258 (+0.78%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Konstantin Seurer <konstantin.seurer@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18541>

src/amd/common/ac_shader_args.h
src/amd/compiler/aco_instruction_selection.cpp
src/amd/vulkan/radv_cmd_buffer.c
src/amd/vulkan/radv_pipeline_rt.c
src/amd/vulkan/radv_shader.h
src/amd/vulkan/radv_shader_args.c
src/amd/vulkan/radv_shader_info.c
src/compiler/nir/nir_divergence_analysis.c
src/compiler/nir/nir_intrinsics.py

index f50c350..ae9be30 100644 (file)
@@ -152,6 +152,7 @@ struct ac_shader_args {
    struct ac_arg sbt_descriptors;
    struct ac_arg ray_launch_size_addr;
    struct ac_arg force_vrs_rates;
+   struct ac_arg rt_dynamic_callable_stack_base;
 };
 
 void ac_add_arg(struct ac_shader_args *info, enum ac_arg_regfile regfile, unsigned registers,
index 9b564d6..3c156e6 100644 (file)
@@ -9167,6 +9167,10 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
       break;
    }
    case nir_intrinsic_bvh64_intersect_ray_amd: visit_bvh64_intersect_ray_amd(ctx, instr); break;
+   case nir_intrinsic_load_rt_dynamic_callable_stack_base_amd:
+      bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
+               get_arg(ctx, ctx->args->ac.rt_dynamic_callable_stack_base));
+      break;
    case nir_intrinsic_overwrite_vs_arguments_amd: {
       ctx->arg_temps[ctx->args->ac.vertex_id.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa);
       ctx->arg_temps[ctx->args->ac.instance_id.arg_index] = get_ssa_temp(ctx, instr->src[1].ssa);
index 362acef..573a3e7 100644 (file)
@@ -8559,6 +8559,14 @@ radv_trace_rays(struct radv_cmd_buffer *cmd_buffer, const VkTraceRaysIndirectCom
                                base_reg + size_loc->sgpr_idx * 4, launch_size_va, true);
    }
 
+   struct radv_userdata_info *base_loc = radv_lookup_user_sgpr(
+      &pipeline->base, MESA_SHADER_COMPUTE, AC_UD_CS_RAY_DYNAMIC_CALLABLE_STACK_BASE);
+   if (base_loc->sgpr_idx != -1) {
+      struct radv_shader_info *cs_info = &pipeline->base.shaders[MESA_SHADER_COMPUTE]->info;
+      radeon_set_sh_reg(cmd_buffer->cs, R_00B900_COMPUTE_USER_DATA_0 + base_loc->sgpr_idx * 4,
+                        pipeline->base.scratch_bytes_per_wave / cs_info->wave_size);
+   }
+
    radv_dispatch(cmd_buffer, &info, pipeline, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
 }
 
index 7c5443c..a128bd9 100644 (file)
@@ -1769,7 +1769,10 @@ create_rt_shader(struct radv_device *device, const VkRayTracingPipelineCreateInf
 
    struct rt_variables vars = create_rt_variables(b.shader, pCreateInfo, stack_sizes);
    load_sbt_entry(&b, &vars, nir_imm_int(&b, 0), SBT_RAYGEN, 0);
-   nir_store_var(&b, vars.stack_ptr, nir_imm_int(&b, 0), 0x1);
+   if (radv_rt_pipeline_has_dynamic_stack_size(pCreateInfo))
+      nir_store_var(&b, vars.stack_ptr, nir_load_rt_dynamic_callable_stack_base_amd(&b), 0x1);
+   else
+      nir_store_var(&b, vars.stack_ptr, nir_imm_int(&b, 0), 0x1);
 
    nir_store_var(&b, vars.main_loop_case_visited, nir_imm_bool(&b, true), 1);
 
index b889bc4..d81aafc 100644 (file)
@@ -153,6 +153,7 @@ enum radv_ud_index {
    AC_UD_CS_GRID_SIZE = AC_UD_SHADER_START,
    AC_UD_CS_SBT_DESCRIPTORS,
    AC_UD_CS_RAY_LAUNCH_SIZE_ADDR,
+   AC_UD_CS_RAY_DYNAMIC_CALLABLE_STACK_BASE,
    AC_UD_CS_TASK_RING_OFFSETS,
    AC_UD_CS_TASK_DRAW_ID,
    AC_UD_CS_TASK_IB,
@@ -345,6 +346,7 @@ struct radv_shader_info {
 
       bool uses_sbt;
       bool uses_ray_launch_size;
+      bool uses_dynamic_rt_callable_stack;
    } cs;
    struct {
       uint64_t tes_inputs_read;
index 058fbbd..84c316f 100644 (file)
@@ -189,6 +189,8 @@ allocate_user_sgprs(enum amd_gfx_level gfx_level, const struct radv_shader_info
          user_sgpr_count += args->load_grid_size_from_user_sgpr ? 3 : 2;
       if (info->cs.uses_ray_launch_size)
          user_sgpr_count += 2;
+      if (info->cs.uses_dynamic_rt_callable_stack)
+         user_sgpr_count += 1;
       if (info->vs.needs_draw_id)
          user_sgpr_count += 1;
       if (stage == MESA_SHADER_TASK)
@@ -605,6 +607,11 @@ radv_declare_shader_args(enum amd_gfx_level gfx_level, const struct radv_pipelin
          ac_add_arg(&args->ac, AC_ARG_SGPR, 2, AC_ARG_CONST_PTR, &args->ac.ray_launch_size_addr);
       }
 
+      if (info->cs.uses_dynamic_rt_callable_stack) {
+         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT,
+                    &args->ac.rt_dynamic_callable_stack_base);
+      }
+
       if (info->vs.needs_draw_id) {
          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id);
       }
@@ -872,6 +879,9 @@ radv_declare_shader_args(enum amd_gfx_level gfx_level, const struct radv_pipelin
       if (args->ac.ray_launch_size_addr.used) {
          set_loc_shader_ptr(args, AC_UD_CS_RAY_LAUNCH_SIZE_ADDR, &user_sgpr_idx);
       }
+      if (args->ac.rt_dynamic_callable_stack_base.used) {
+         set_loc_shader(args, AC_UD_CS_RAY_DYNAMIC_CALLABLE_STACK_BASE, &user_sgpr_idx, 1);
+      }
       if (args->ac.draw_id.used) {
          set_loc_shader(args, AC_UD_CS_TASK_DRAW_ID, &user_sgpr_idx, 1);
       }
index 42260d9..336ca1c 100644 (file)
@@ -210,6 +210,9 @@ gather_intrinsic_info(const nir_shader *nir, const nir_intrinsic_instr *instr,
    case nir_intrinsic_load_force_vrs_rates_amd:
       info->force_vrs_per_vertex = true;
       break;
+   case nir_intrinsic_load_rt_dynamic_callable_stack_base_amd:
+      info->cs.uses_dynamic_rt_callable_stack = true;
+      break;
    default:
       break;
    }
index 867cbb5..3c34f5c 100644 (file)
@@ -173,6 +173,7 @@ visit_intrinsic(nir_shader *shader, nir_intrinsic_instr *instr)
    case nir_intrinsic_load_tess_level_outer_default:
    case nir_intrinsic_load_scalar_arg_amd:
    case nir_intrinsic_load_smem_amd:
+   case nir_intrinsic_load_rt_dynamic_callable_stack_base_amd:
    case nir_intrinsic_load_global_const_block_intel:
    case nir_intrinsic_load_reloc_const_intel:
    case nir_intrinsic_load_global_block_intel:
index 71ae27b..081db81 100644 (file)
@@ -1398,6 +1398,9 @@ system_value("intersection_opaque_amd", 1, bit_sizes=[1])
 # Used for indirect ray tracing.
 system_value("ray_launch_size_addr_amd", 1, bit_sizes=[64])
 
+# Scratch base of callable stack for ray tracing.
+system_value("rt_dynamic_callable_stack_base_amd", 1)
+
 # Load forced VRS rates.
 intrinsic("load_force_vrs_rates_amd", dest_comp=1, bit_sizes=[32], flags=[CAN_ELIMINATE, CAN_REORDER])