From 96ba0344db8f67acf66b81c3f6cfd6539e2e071a Mon Sep 17 00:00:00 2001
From: Kenneth Graunke
Date: Thu, 19 Jan 2023 23:25:20 -0800
Subject: [PATCH] intel: Use common helpers for TCS passthrough shaders

Rob added these new helpers a while back, which freedreno and radeonsi
both share. We should use them too.

The new helpers use variables and system value intrinsics, so we can
drop the explicit binding table creation and just use the normal paths.

Because we have to rewrite the system value uploading anyway, we drop
the scrambling of the default tessellation levels on upload, and instead
let the compiler go ahead and remap components like any normal shader.

In theory, this results in more shuffling in the shader. In practice,
we already do MOVs for message setup. In the passthrough shaders I
looked at, this resulted in no extra instructions on Icelake (SIMD8
SINGLE_PATCH) and Tigerlake (8_PATCH). On Haswell, one shader grew by
a single instruction for a pittance of cycles in a stage that isn't a
performance bottleneck anyway. Avoiding remapping wasn't so much of an
optimization as just the way that I originally wrote it. Not worth it.

Reviewed-by: Caio Oliveira Part-of: --- src/gallium/drivers/crocus/crocus_program.c | 81 ++++++++++++++--------------- src/gallium/drivers/iris/iris_program.c | 80 ++++++++++++++-------------- src/intel/compiler/brw_nir.c | 57 +++++--------------- 3 files changed, 91 insertions(+), 127 deletions(-) diff --git a/src/gallium/drivers/crocus/crocus_program.c b/src/gallium/drivers/crocus/crocus_program.c index 404c42b..b1de026 100644 --- a/src/gallium/drivers/crocus/crocus_program.c +++ b/src/gallium/drivers/crocus/crocus_program.c @@ -449,6 +449,8 @@ crocus_setup_uniforms(ASSERTED const struct intel_device_info *devinfo, unsigned num_system_values = 0; unsigned patch_vert_idx = -1; + unsigned tess_outer_default_idx = -1; + unsigned tess_inner_default_idx = -1; unsigned ucp_idx[CROCUS_MAX_CLIP_PLANES]; unsigned img_idx[PIPE_MAX_SHADER_IMAGES]; unsigned variable_group_size_idx = -1; @@ -539,6 +541,36 @@ crocus_setup_uniforms(ASSERTED const struct intel_device_info *devinfo, b.cursor = nir_before_instr(instr); offset = nir_imm_int(&b, patch_vert_idx * sizeof(uint32_t)); break; + case nir_intrinsic_load_tess_level_outer_default: + if (tess_outer_default_idx == -1) { + tess_outer_default_idx = num_system_values; + num_system_values += 4; + } + + for (int i = 0; i < 4; i++) { + system_values[tess_outer_default_idx + i] = + BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X + i; + } + + b.cursor = nir_before_instr(instr); + offset = + nir_imm_int(&b, tess_outer_default_idx * sizeof(uint32_t)); + break; + case nir_intrinsic_load_tess_level_inner_default: + if (tess_inner_default_idx == -1) { + tess_inner_default_idx = num_system_values; + num_system_values += 2; + } + + for (int i = 0; i < 2; i++) { + system_values[tess_inner_default_idx + i] = + BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X + i; + } + + b.cursor = nir_before_instr(instr); + offset = + nir_imm_int(&b, tess_inner_default_idx * sizeof(uint32_t)); + break; case nir_intrinsic_image_deref_load_param_intel: { assert(devinfo->ver < 9); 
nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); @@ -1420,51 +1452,18 @@ crocus_compile_tcs(struct crocus_context *ice, if (ish) { nir = nir_shader_clone(mem_ctx, ish->nir); - - crocus_setup_uniforms(devinfo, mem_ctx, nir, prog_data, &system_values, - &num_system_values, &num_cbufs); - - crocus_lower_swizzles(nir, &key->base.tex); - crocus_setup_binding_table(devinfo, nir, &bt, /* num_render_targets */ 0, - num_system_values, num_cbufs, &key->base.tex); - if (can_push_ubo(devinfo)) - brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges); } else { nir = brw_nir_create_passthrough_tcs(mem_ctx, compiler, key); + } - /* Reserve space for passing the default tess levels as constants. */ - num_cbufs = 1; - num_system_values = 8; - system_values = - rzalloc_array(mem_ctx, enum brw_param_builtin, num_system_values); - prog_data->param = rzalloc_array(mem_ctx, uint32_t, num_system_values); - prog_data->nr_params = num_system_values; - - if (key->_tes_primitive_mode == TESS_PRIMITIVE_QUADS) { - for (int i = 0; i < 4; i++) - system_values[7 - i] = BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X + i; - - system_values[3] = BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X; - system_values[2] = BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_Y; - } else if (key->_tes_primitive_mode == TESS_PRIMITIVE_TRIANGLES) { - for (int i = 0; i < 3; i++) - system_values[7 - i] = BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X + i; - - system_values[4] = BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X; - } else { - assert(key->_tes_primitive_mode == TESS_PRIMITIVE_ISOLINES); - system_values[7] = BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_Y; - system_values[6] = BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X; - } - - /* Manually setup the TCS binding table. 
*/ - memset(&bt, 0, sizeof(bt)); - bt.sizes[CROCUS_SURFACE_GROUP_UBO] = 1; - bt.used_mask[CROCUS_SURFACE_GROUP_UBO] = 1; - bt.size_bytes = 4; + crocus_setup_uniforms(devinfo, mem_ctx, nir, prog_data, &system_values, + &num_system_values, &num_cbufs); - prog_data->ubo_ranges[0].length = 1; - } + crocus_lower_swizzles(nir, &key->base.tex); + crocus_setup_binding_table(devinfo, nir, &bt, /* num_render_targets */ 0, + num_system_values, num_cbufs, &key->base.tex); + if (can_push_ubo(devinfo)) + brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges); struct brw_tcs_prog_key key_clean = *key; crocus_sanitize_tex_key(&key_clean.base.tex); diff --git a/src/gallium/drivers/iris/iris_program.c b/src/gallium/drivers/iris/iris_program.c index f168d2d..e899734 100644 --- a/src/gallium/drivers/iris/iris_program.c +++ b/src/gallium/drivers/iris/iris_program.c @@ -482,6 +482,8 @@ iris_setup_uniforms(ASSERTED const struct intel_device_info *devinfo, unsigned num_system_values = 0; unsigned patch_vert_idx = -1; + unsigned tess_outer_default_idx = -1; + unsigned tess_inner_default_idx = -1; unsigned ucp_idx[IRIS_MAX_CLIP_PLANES]; unsigned img_idx[PIPE_MAX_SHADER_IMAGES]; unsigned variable_group_size_idx = -1; @@ -581,6 +583,36 @@ iris_setup_uniforms(ASSERTED const struct intel_device_info *devinfo, offset = nir_imm_int(&b, system_values_start + patch_vert_idx * sizeof(uint32_t)); break; + case nir_intrinsic_load_tess_level_outer_default: + if (tess_outer_default_idx == -1) { + tess_outer_default_idx = num_system_values; + num_system_values += 4; + } + + for (int i = 0; i < 4; i++) { + system_values[tess_outer_default_idx + i] = + BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X + i; + } + + b.cursor = nir_before_instr(instr); + offset = nir_imm_int(&b, system_values_start + + tess_outer_default_idx * sizeof(uint32_t)); + break; + case nir_intrinsic_load_tess_level_inner_default: + if (tess_inner_default_idx == -1) { + tess_inner_default_idx = num_system_values; + 
num_system_values += 2; + } + + for (int i = 0; i < 2; i++) { + system_values[tess_inner_default_idx + i] = + BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X + i; + } + + b.cursor = nir_before_instr(instr); + offset = nir_imm_int(&b, system_values_start + + tess_inner_default_idx * sizeof(uint32_t)); + break; case nir_intrinsic_image_deref_load_param_intel: { assert(devinfo->ver < 9); nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); @@ -1530,50 +1562,16 @@ iris_compile_tcs(struct iris_screen *screen, if (ish) { nir = nir_shader_clone(mem_ctx, ish->nir); - - iris_setup_uniforms(devinfo, mem_ctx, nir, prog_data, 0, &system_values, - &num_system_values, &num_cbufs); - iris_setup_binding_table(devinfo, nir, &bt, /* num_render_targets */ 0, - num_system_values, num_cbufs); - brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges); } else { - nir = - brw_nir_create_passthrough_tcs(mem_ctx, compiler, &brw_key); - - /* Reserve space for passing the default tess levels as constants. */ - num_cbufs = 1; - num_system_values = 8; - system_values = - rzalloc_array(mem_ctx, enum brw_param_builtin, num_system_values); - prog_data->param = rzalloc_array(mem_ctx, uint32_t, num_system_values); - prog_data->nr_params = num_system_values; - - if (key->_tes_primitive_mode == TESS_PRIMITIVE_QUADS) { - for (int i = 0; i < 4; i++) - system_values[7 - i] = BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X + i; - - system_values[3] = BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X; - system_values[2] = BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_Y; - } else if (key->_tes_primitive_mode == TESS_PRIMITIVE_TRIANGLES) { - for (int i = 0; i < 3; i++) - system_values[7 - i] = BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X + i; - - system_values[4] = BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X; - } else { - assert(key->_tes_primitive_mode == TESS_PRIMITIVE_ISOLINES); - system_values[7] = BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_Y; - system_values[6] = BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X; - } - - /* Manually setup the TCS binding table. 
*/ - memset(&bt, 0, sizeof(bt)); - bt.sizes[IRIS_SURFACE_GROUP_UBO] = 1; - bt.used_mask[IRIS_SURFACE_GROUP_UBO] = 1; - bt.size_bytes = 4; - - prog_data->ubo_ranges[0].length = 1; + nir = brw_nir_create_passthrough_tcs(mem_ctx, compiler, &brw_key); } + iris_setup_uniforms(devinfo, mem_ctx, nir, prog_data, 0, &system_values, + &num_system_values, &num_cbufs); + iris_setup_binding_table(devinfo, nir, &bt, /* num_render_targets */ 0, + num_system_values, num_cbufs); + brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges); + struct brw_compile_tcs_params params = { .nir = nir, .key = &brw_key, diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c index 76c2ca4..c2ac6be 100644 --- a/src/intel/compiler/brw_nir.c +++ b/src/intel/compiler/brw_nir.c @@ -189,9 +189,6 @@ remap_patch_urb_offsets(nir_block *block, nir_builder *b, const struct brw_vue_map *vue_map, enum tess_primitive_mode tes_primitive_mode) { - const bool is_passthrough_tcs = b->shader->info.name && - strcmp(b->shader->info.name, "passthrough TCS") == 0; - nir_foreach_instr_safe(instr, block) { if (instr->type != nir_instr_type_intrinsic) continue; @@ -203,8 +200,7 @@ remap_patch_urb_offsets(nir_block *block, nir_builder *b, if ((stage == MESA_SHADER_TESS_CTRL && is_output(intrin)) || (stage == MESA_SHADER_TESS_EVAL && is_input(intrin))) { - if (!is_passthrough_tcs && - remap_tess_levels(b, intrin, tes_primitive_mode)) + if (remap_tess_levels(b, intrin, tes_primitive_mode)) continue; int vue_slot = vue_map->varying_to_slot[intrin->const_index[0]]; @@ -1858,50 +1854,21 @@ brw_nir_create_passthrough_tcs(void *mem_ctx, const struct brw_compiler *compile { const nir_shader_compiler_options *options = compiler->nir_options[MESA_SHADER_TESS_CTRL]; - nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_TESS_CTRL, - options, "passthrough TCS"); - ralloc_steal(mem_ctx, b.shader); - nir_shader *nir = b.shader; - nir_variable *var; - nir_ssa_def *load; - nir_ssa_def *zero = 
nir_imm_int(&b, 0); - nir_ssa_def *invoc_id = nir_load_invocation_id(&b); - - nir->info.inputs_read = key->outputs_written & - ~(VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER); - nir->info.outputs_written = key->outputs_written; - nir->info.tess.tcs_vertices_out = key->input_vertices; - nir->num_uniforms = 8 * sizeof(uint32_t); - - var = nir_variable_create(nir, nir_var_uniform, glsl_vec4_type(), "hdr_0"); - var->data.location = 0; - var = nir_variable_create(nir, nir_var_uniform, glsl_vec4_type(), "hdr_1"); - var->data.location = 1; - - /* Write the patch URB header. */ - for (int i = 0; i <= 1; i++) { - load = nir_load_uniform(&b, 4, 32, zero, .base = i * 4 * sizeof(uint32_t)); - - nir_store_output(&b, load, zero, - .base = VARYING_SLOT_TESS_LEVEL_INNER - i, - .write_mask = WRITEMASK_XYZW); - } - - /* Copy inputs to outputs. */ - uint64_t varyings = nir->info.inputs_read; - while (varyings != 0) { - const int varying = ffsll(varyings) - 1; - - load = nir_load_per_vertex_input(&b, 4, 32, invoc_id, zero, .base = varying); + uint64_t inputs_read = key->outputs_written & + ~(VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER); - nir_store_per_vertex_output(&b, load, invoc_id, zero, - .base = varying, - .write_mask = WRITEMASK_XYZW); + unsigned locations[64]; + unsigned num_locations = 0; - varyings &= ~BITFIELD64_BIT(varying); - } + u_foreach_bit64(varying, inputs_read) + locations[num_locations++] = varying; + nir_shader *nir = + nir_create_passthrough_tcs_impl(options, locations, num_locations, + key->input_vertices); + nir->info.inputs_read = inputs_read; + nir->info.tess._primitive_mode = key->_tes_primitive_mode; nir_validate_shader(nir, "in brw_nir_create_passthrough_tcs"); struct brw_nir_compiler_opts opts = {}; -- 2.7.4