From 0f2491cbddf80a4c95df5d57ae438b130288f013 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Mon, 13 Mar 2023 00:18:47 -0400 Subject: [PATCH] nir: add dual-slot input information into load_input intrinsics This is necessary to allow optimizing VS inputs after nir_lower_io, which is currently impossible because the loss of dual-slot information in NIR would break VS inputs. With this, driver locations can be recomputed by calling nir_recompute_io_bases. It's a prerequisite for optimizing varyings with lowered IO. When this is used, we will be able to eliminate unused dual-slot VS inputs as well as unused low and high halves of dual-slot VS inputs for the first time, which can happen due to optimizations of varyings. Without this, st/mesa binds vertex buffers for dual-slot inputs that are fully or partially unused in the shader. Reviewed-By: Mike Blumenkrantz Part-of: --- src/compiler/nir/nir.h | 21 ++++++++++++++++++- src/compiler/nir/nir_lower_io.c | 46 ++++++++++++++++++++++++++++++++++------- src/compiler/nir/nir_print.c | 3 +++ 3 files changed, 61 insertions(+), 9 deletions(-) diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index 2d17343..6012d20 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -1847,6 +1847,7 @@ typedef struct nir_io_semantics { unsigned per_view : 1; unsigned high_16bits : 1; /* whether accessing low or high half of the slot */ unsigned invariant : 1; /* The variable has the invariant flag set */ + unsigned high_dvec2 : 1; /* whether accessing the high half of dvec3/dvec4 */ /* CLIP_DISTn, LAYER, VIEWPORT, and TESS_LEVEL_* have up to 3 uses: * - an output consumed by the next stage * - a system value output affecting fixed-func hardware, e.g. the clipper @@ -1857,7 +1858,7 @@ typedef struct nir_io_semantics { unsigned no_varying : 1; /* whether this output isn't consumed by the next stage */ unsigned no_sysval_output : 1; /* whether this system value output has no effect due to current pipeline states */ - unsigned _pad : 3; + unsigned _pad : 2; } nir_io_semantics; /* Transform feedback info for 2 outputs. nir_intrinsic_store_output contains @@ -4997,13 +4998,31 @@ typedef enum { /* If set, this causes all 64-bit IO operations to be lowered on-the-fly * to 32-bit operations. This is only valid for nir_var_shader_in/out * modes. + * + * Note that this destroys dual-slot information i.e. whether an input + * occupies the low or high half of dvec4. Instead, it adds an offset of 1 + * to the load (which is ambiguous) and expects driver locations of inputs + * to be final, which prevents any further optimizations. + * + * TODO: remove this in favor of nir_lower_io_lower_64bit_to_32_new. */ nir_lower_io_lower_64bit_to_32 = (1 << 0), + /* If set, this causes the subset of 64-bit IO operations involving floats to be lowered on-the-fly * to 32-bit operations. This is only valid for nir_var_shader_in/out * modes. */ nir_lower_io_lower_64bit_float_to_32 = (1 << 1), + + /* This causes all 64-bit IO operations to be lowered to 32-bit operations. + * This is only valid for nir_var_shader_in/out modes. + * + * Only VS inputs: Dual slot information is preserved as nir_io_semantics:: + * high_dvec2 and gathered into shader_info::dual_slot_inputs, so that + * the shader can be arbitrarily optimized and the low or high half of + * dvec4 can be DCE'd independently without affecting the other half. + */ + nir_lower_io_lower_64bit_to_32_new = (1 << 2), } nir_lower_io_options; bool nir_lower_io(nir_shader *shader, nir_variable_mode modes, diff --git a/src/compiler/nir/nir_lower_io.c b/src/compiler/nir/nir_lower_io.c index 8ab6e51..107cb87 100644 --- a/src/compiler/nir/nir_lower_io.c +++ b/src/compiler/nir/nir_lower_io.c @@ -158,6 +158,16 @@ nir_is_arrayed_io(const nir_variable *var, gl_shader_stage stage) return false; } +static bool +uses_high_dvec2_semantic(struct lower_io_state *state, + const nir_variable *var) +{ + return state->builder.shader->info.stage == MESA_SHADER_VERTEX && + state->options & nir_lower_io_lower_64bit_to_32_new && + var->data.mode == nir_var_shader_in && + glsl_type_is_dual_slot(glsl_without_array(var->type)); +} + static unsigned get_number_of_slots(struct lower_io_state *state, const nir_variable *var) @@ -181,7 +191,8 @@ get_number_of_slots(struct lower_io_state *state, !nir_is_arrayed_io(var, state->builder.shader->info.stage)) return 1; - return state->type_size(type, var->data.bindless); + return state->type_size(type, var->data.bindless) / + (uses_high_dvec2_semantic(state, var) ? 2 : 1); } static nir_def * @@ -251,7 +262,7 @@ static nir_def * emit_load(struct lower_io_state *state, nir_def *array_index, nir_variable *var, nir_def *offset, unsigned component, unsigned num_components, unsigned bit_size, - nir_alu_type dest_type) + nir_alu_type dest_type, bool high_dvec2) { nir_builder *b = &state->builder; const nir_shader *nir = b->shader; @@ -324,6 +335,7 @@ emit_load(struct lower_io_state *state, semantics.medium_precision = var->data.precision == GLSL_PRECISION_MEDIUM || var->data.precision == GLSL_PRECISION_LOW; + semantics.high_dvec2 = high_dvec2; nir_intrinsic_set_io_semantics(load, semantics); } @@ -350,14 +362,23 @@ lower_load(nir_intrinsic_instr *intrin, struct lower_io_state *state, { const bool lower_double = !glsl_type_is_integer(type) && state->options & nir_lower_io_lower_64bit_float_to_32; if (intrin->def.bit_size == 64 && - (lower_double || (state->options & nir_lower_io_lower_64bit_to_32))) { + (lower_double || (state->options & (nir_lower_io_lower_64bit_to_32_new | + nir_lower_io_lower_64bit_to_32)))) { nir_builder *b = &state->builder; + bool use_high_dvec2_semantic = uses_high_dvec2_semantic(state, var); + + /* Each slot is a dual slot, so divide the offset within the variable + * by 2. + */ + if (use_high_dvec2_semantic) + offset = nir_ushr_imm(b, offset, 1); const unsigned slot_size = state->type_size(glsl_dvec_type(2), false); nir_def *comp64[4]; assert(component == 0 || component == 2); unsigned dest_comp = 0; + bool high_dvec2 = false; while (dest_comp < intrin->def.num_components) { const unsigned num_comps = MIN2(intrin->def.num_components - dest_comp, @@ -365,7 +386,7 @@ lower_load(nir_intrinsic_instr *intrin, struct lower_io_state *state, nir_def *data32 = emit_load(state, array_index, var, offset, component, - num_comps * 2, 32, nir_type_uint32); + num_comps * 2, 32, nir_type_uint32, high_dvec2); for (unsigned i = 0; i < num_comps; i++) { comp64[dest_comp + i] = nir_pack_64_2x32(b, nir_channels(b, data32, 3 << (i * 2))); @@ -374,7 +395,15 @@ lower_load(nir_intrinsic_instr *intrin, struct lower_io_state *state, /* Only the first store has a component offset */ component = 0; dest_comp += num_comps; - offset = nir_iadd_imm(b, offset, slot_size); + + if (use_high_dvec2_semantic) { + /* Increment the offset when we wrap around the dual slot. */ + if (high_dvec2) + offset = nir_iadd_imm(b, offset, slot_size); + high_dvec2 = !high_dvec2; + } else { + offset = nir_iadd_imm(b, offset, slot_size); + } } return nir_vec(b, comp64, intrin->def.num_components); @@ -384,12 +413,12 @@ lower_load(nir_intrinsic_instr *intrin, struct lower_io_state *state, return nir_b2b1(&state->builder, emit_load(state, array_index, var, offset, component, intrin->def.num_components, 32, - nir_type_bool32)); + nir_type_bool32, false)); } else { return emit_load(state, array_index, var, offset, component, intrin->def.num_components, intrin->def.bit_size, - nir_get_nir_type_for_glsl_type(type)); + nir_get_nir_type_for_glsl_type(type), false); } } @@ -461,7 +490,8 @@ lower_store(nir_intrinsic_instr *intrin, struct lower_io_state *state, { const bool lower_double = !glsl_type_is_integer(type) && state->options & nir_lower_io_lower_64bit_float_to_32; if (intrin->src[1].ssa->bit_size == 64 && - (lower_double || (state->options & nir_lower_io_lower_64bit_to_32))) { + (lower_double || (state->options & (nir_lower_io_lower_64bit_to_32 | + nir_lower_io_lower_64bit_to_32_new)))) { nir_builder *b = &state->builder; const unsigned slot_size = state->type_size(glsl_dvec_type(2), false); diff --git a/src/compiler/nir/nir_print.c b/src/compiler/nir/nir_print.c index e87e8b5..301b98a 100644 --- a/src/compiler/nir/nir_print.c +++ b/src/compiler/nir/nir_print.c @@ -1378,6 +1378,9 @@ print_intrinsic_instr(nir_intrinsic_instr *instr, print_state *state) if (io.high_16bits) fprintf(fp, " high_16bits"); + if (io.high_dvec2) + fprintf(fp, " high_dvec2"); + if (io.no_varying) fprintf(fp, " no_varying"); -- 2.7.4