From 429ef02f83e6516ec984caefba5046c939c6b8ee Mon Sep 17 00:00:00 2001 From: Lionel Landwerlin Date: Sat, 8 Apr 2023 21:34:35 +0300 Subject: [PATCH] intel/fs: make tcs input_vertices dynamic We need to do 3 things to accomplish this : 1. make all the register access consider the maximal case when unknown at compile time 2. move the clamping of load_per_vertex_input prior to lowering nir_intrinsic_load_patch_vertices_in (in the dynamic cases, the clamping will use the nir_intrinsic_load_patch_vertices_in to clamp), meaning clamping using derefs rather than lowered nir_intrinsic_load_per_vertex_input 3. in the known cases, lower nir_intrinsic_load_patch_vertices_in in NIR (so that the clamped elements still be vectorized to the smallest number of URB read messages) Signed-off-by: Lionel Landwerlin Reviewed-by: Emma Anholt Part-of: --- src/intel/compiler/brw_compiler.h | 10 +++ src/intel/compiler/brw_fs_nir.cpp | 8 +-- src/intel/compiler/brw_fs_thread_payload.cpp | 4 +- src/intel/compiler/brw_nir.c | 10 +++ src/intel/compiler/brw_nir.h | 8 ++- .../compiler/brw_nir_clamp_per_vertex_loads.c | 75 +++++++++++++++++++--- src/intel/compiler/brw_vec4_tcs.cpp | 4 +- 7 files changed, 97 insertions(+), 22 deletions(-) diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h index f456acc..2981bd9 100644 --- a/src/intel/compiler/brw_compiler.h +++ b/src/intel/compiler/brw_compiler.h @@ -322,6 +322,7 @@ struct brw_tcs_prog_key enum tess_primitive_mode _tes_primitive_mode; + /** Number of input vertices, 0 means dynamic */ unsigned input_vertices; /** A bitfield of per-patch outputs written. */ @@ -331,6 +332,15 @@ struct brw_tcs_prog_key uint32_t padding:24; }; +#define BRW_MAX_TCS_INPUT_VERTICES (32) + +static inline uint32_t +brw_tcs_prog_key_input_vertices(const struct brw_tcs_prog_key *key) +{ + return key->input_vertices != 0 ? + key->input_vertices : BRW_MAX_TCS_INPUT_VERTICES; +} + /** The program key for Tessellation Evaluation Shaders. */ struct brw_tes_prog_key { diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index fa98909..be24207 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -2665,7 +2665,8 @@ fs_visitor::get_tcs_multi_patch_icp_handle(const fs_builder &bld, * we might read up to nir->info.gs.vertices_in registers. */ bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, start, - icp_offset_bytes, brw_imm_ud(tcs_key->input_vertices * REG_SIZE)); + icp_offset_bytes, + brw_imm_ud(brw_tcs_prog_key_input_vertices(tcs_key) * REG_SIZE)); return icp_handle; } @@ -2675,7 +2676,6 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr) { assert(stage == MESA_SHADER_TESS_CTRL); - struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key; struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data); struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base; @@ -2690,10 +2690,6 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld, case nir_intrinsic_load_invocation_id: bld.MOV(retype(dst, invocation_id.type), invocation_id); break; - case nir_intrinsic_load_patch_vertices_in: - bld.MOV(retype(dst, BRW_REGISTER_TYPE_D), - brw_imm_d(tcs_key->input_vertices)); - break; case nir_intrinsic_scoped_barrier: if (nir_intrinsic_memory_scope(instr) != NIR_SCOPE_NONE) diff --git a/src/intel/compiler/brw_fs_thread_payload.cpp b/src/intel/compiler/brw_fs_thread_payload.cpp index 8b78114..0901d10 100644 --- a/src/intel/compiler/brw_fs_thread_payload.cpp +++ b/src/intel/compiler/brw_fs_thread_payload.cpp @@ -48,7 +48,7 @@ tcs_thread_payload::tcs_thread_payload(const fs_visitor &v) num_regs = 5; } else { assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_MULTI_PATCH); - assert(tcs_key->input_vertices > 0); + assert(tcs_key->input_vertices <= BRW_MAX_TCS_INPUT_VERTICES); patch_urb_output = brw_ud8_grf(1, 0); @@ -59,7 +59,7 @@ tcs_thread_payload::tcs_thread_payload(const fs_visitor &v) /* ICP handles occupy the next 1-32 registers. */ icp_handle_start = brw_ud8_grf(r, 0); - r += tcs_key->input_vertices; + r += brw_tcs_prog_key_input_vertices(tcs_key); num_regs = r; } diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c index 31168b4..436a088 100644 --- a/src/intel/compiler/brw_nir.c +++ b/src/intel/compiler/brw_nir.c @@ -1049,6 +1049,14 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir, nir_var_mem_ubo | nir_var_mem_ssbo, nir_lower_direct_array_deref_of_vec_load); + /* Clamp load_per_vertex_input of the TCS stage so that we do not generate + * loads reading out of bounds. We can do this here because we called + * nir_lower_system_values above. + */ + if (nir->info.stage == MESA_SHADER_TESS_CTRL && + compiler->use_tcs_multi_patch) + OPT(brw_nir_clamp_per_vertex_loads); + /* Get rid of split copies */ brw_nir_optimize(nir, compiler); } @@ -1985,6 +1993,8 @@ nir_shader * brw_nir_create_passthrough_tcs(void *mem_ctx, const struct brw_compiler *compiler, const struct brw_tcs_prog_key *key) { + assert(key->input_vertices > 0); + const nir_shader_compiler_options *options = compiler->nir_options[MESA_SHADER_TESS_CTRL]; diff --git a/src/intel/compiler/brw_nir.h b/src/intel/compiler/brw_nir.h index 6700453..e19c3c9 100644 --- a/src/intel/compiler/brw_nir.h +++ b/src/intel/compiler/brw_nir.h @@ -98,6 +98,9 @@ struct brw_nir_compiler_opts { /* Whether robust image access is enabled */ bool robust_image_access; + + /* Input vertices for TCS stage (0 means dynamic) */ + unsigned input_vertices; }; void brw_preprocess_nir(const struct brw_compiler *compiler, @@ -191,8 +194,9 @@ bool brw_nir_opt_peephole_ffma(nir_shader *shader); bool brw_nir_opt_peephole_imul32x16(nir_shader *shader); -bool brw_nir_clamp_per_vertex_loads(nir_shader *shader, - unsigned input_vertices); +bool brw_nir_clamp_per_vertex_loads(nir_shader *shader); + +bool brw_nir_lower_patch_vertices_in(nir_shader *shader, unsigned input_vertices); bool brw_nir_blockify_uniform_loads(nir_shader *shader, const struct intel_device_info *devinfo); diff --git a/src/intel/compiler/brw_nir_clamp_per_vertex_loads.c b/src/intel/compiler/brw_nir_clamp_per_vertex_loads.c index 599de48..afabe3f 100644 --- a/src/intel/compiler/brw_nir_clamp_per_vertex_loads.c +++ b/src/intel/compiler/brw_nir_clamp_per_vertex_loads.c @@ -24,38 +24,93 @@ /* * Limit input per vertex input accesses. This is useful for the tesselation stages. * On Gfx12.5+ out of bound accesses generate hangs. + * + * This pass operates on derefs, it must be called before shader inputs are + * lowered. */ #include "brw_nir.h" #include "compiler/nir/nir_builder.h" +#include "compiler/nir/nir_deref.h" static bool -lower_instr(nir_builder *b, nir_instr *instr, void *cb_data) +clamp_per_vertex_loads_instr(nir_builder *b, nir_instr *instr, void *cb_data) { - unsigned *input_vertices = cb_data; + if (instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + if (intrin->intrinsic != nir_intrinsic_load_deref) + return false; + + nir_deref_instr *deref = nir_instr_as_deref(intrin->src[0].ssa->parent_instr); + nir_variable *var = nir_deref_instr_get_variable(deref); + if (var == NULL || (var->data.mode & nir_var_shader_in) == 0) + return false; + + nir_deref_path path; + nir_deref_path_init(&path, deref, cb_data); + + bool progress = false; + for (unsigned i = 0; path.path[i]; i++) { + if (path.path[i]->deref_type != nir_deref_type_array) + continue; + + b->cursor = nir_before_instr(&path.path[i]->instr); + + nir_instr_rewrite_src_ssa(&path.path[i]->instr, + &path.path[i]->arr.index, + nir_umin(b, + path.path[i]->arr.index.ssa, + nir_iadd_imm(b, nir_load_patch_vertices_in(b), -1))); + + progress = true; + break; + } + nir_deref_path_finish(&path); + + return progress; +} + +bool +brw_nir_clamp_per_vertex_loads(nir_shader *shader) +{ + void *mem_ctx = ralloc_context(NULL); + + bool ret = nir_shader_instructions_pass(shader, clamp_per_vertex_loads_instr, + nir_metadata_block_index | + nir_metadata_dominance, + mem_ctx); + + ralloc_free(mem_ctx); + + return ret; +} + +static bool +lower_patch_vertices_instr(nir_builder *b, nir_instr *instr, void *cb_data) +{ if (instr->type != nir_instr_type_intrinsic) return false; nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); - if (intrin->intrinsic != nir_intrinsic_load_per_vertex_input) + if (intrin->intrinsic != nir_intrinsic_load_patch_vertices_in) return false; + unsigned *input_vertices = cb_data; + b->cursor = nir_before_instr(instr); - nir_instr_rewrite_src_ssa(instr, - &intrin->src[0], - nir_imin(b, intrin->src[0].ssa, - nir_imm_int(b, *input_vertices - 1))); + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_imm_int(b, *input_vertices)); return true; } bool -brw_nir_clamp_per_vertex_loads(nir_shader *shader, - unsigned input_vertices) +brw_nir_lower_patch_vertices_in(nir_shader *shader, unsigned input_vertices) { - return nir_shader_instructions_pass(shader, lower_instr, + return nir_shader_instructions_pass(shader, lower_patch_vertices_instr, nir_metadata_block_index | nir_metadata_dominance, &input_vertices); diff --git a/src/intel/compiler/brw_vec4_tcs.cpp b/src/intel/compiler/brw_vec4_tcs.cpp index 8d60eee..9e9c147 100644 --- a/src/intel/compiler/brw_vec4_tcs.cpp +++ b/src/intel/compiler/brw_vec4_tcs.cpp @@ -385,8 +385,8 @@ brw_compile_tcs(const struct brw_compiler *compiler, key->_tes_primitive_mode); if (key->quads_workaround) brw_nir_apply_tcs_quads_workaround(nir); - if (compiler->use_tcs_multi_patch) - brw_nir_clamp_per_vertex_loads(nir, key->input_vertices); + if (key->input_vertices > 0) + brw_nir_lower_patch_vertices_in(nir, key->input_vertices); brw_postprocess_nir(nir, compiler, debug_enabled, key->base.robust_buffer_access); -- 2.7.4