From: Samuel Pitoiset Date: Fri, 18 Feb 2022 13:23:42 +0000 (+0100) Subject: radv,aco,llvm: lower post shuffle vertex in NIR X-Git-Tag: upstream/22.3.5~11958 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=342e6f83321a91816358dee82178809b2a8aeeaa;p=platform%2Fupstream%2Fmesa.git radv,aco,llvm: lower post shuffle vertex in NIR fossil-db (Sienna Cichlid): Totals from 774 (0.57% of 134913) affected shaders: VGPRs: 26496 -> 26312 (-0.69%) CodeSize: 1825936 -> 1828812 (+0.16%); split: -0.04%, +0.20% MaxWaves: 22046 -> 22062 (+0.07%) Instrs: 347634 -> 347975 (+0.10%); split: -0.05%, +0.15% Latency: 1363949 -> 1356426 (-0.55%); split: -0.59%, +0.04% InvThroughput: 221529 -> 221380 (-0.07%); split: -0.10%, +0.04% VClause: 5682 -> 5676 (-0.11%); split: -1.46%, +1.36% SClause: 7485 -> 7411 (-0.99%); split: -1.48%, +0.49% Copies: 30481 -> 30420 (-0.20%); split: -0.51%, +0.31% PreVGPRs: 19717 -> 19656 (-0.31%) fossil-db (Polaris10): Totals from 896 (0.66% of 135960) affected shaders: SGPRs: 49824 -> 49648 (-0.35%); split: -0.39%, +0.03% VGPRs: 31040 -> 29948 (-3.52%); split: -3.62%, +0.10% CodeSize: 875960 -> 875920 (-0.00%); split: -0.06%, +0.05% MaxWaves: 6380 -> 6429 (+0.77%) Instrs: 171522 -> 171482 (-0.02%); split: -0.07%, +0.05% Latency: 1356082 -> 1334386 (-1.60%); split: -1.61%, +0.01% InvThroughput: 553389 -> 552957 (-0.08%); split: -0.08%, +0.00% VClause: 4317 -> 4244 (-1.69%); split: -2.41%, +0.72% SClause: 6157 -> 6139 (-0.29%); split: -0.45%, +0.16% Copies: 9340 -> 9235 (-1.12%); split: -1.24%, +0.12% PreVGPRs: 22366 -> 22116 (-1.12%) Signed-off-by: Samuel Pitoiset Reviewed-by: Rhys Perry Part-of: --- diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index f9a3108..ec41356 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -5110,9 +5110,6 @@ visit_load_input(isel_context* ctx, nir_intrinsic_instr* instr) unsigned mask = 
nir_ssa_def_components_read(&instr->dest.ssa) << component; unsigned num_channels = MIN2(util_last_bit(mask), vtx_info->num_channels); - bool post_shuffle = ctx->options->key.vs.vertex_post_shuffle & (1 << location); - if (post_shuffle) - num_channels = MAX2(num_channels, 3); unsigned desc_index = ctx->program->info->vs.use_per_attribute_vb_descs ? location : attrib_binding; @@ -5147,12 +5144,10 @@ visit_load_input(isel_context* ctx, nir_intrinsic_instr* instr) bool direct_fetch = false; /* skip unused channels at the start */ - if (vtx_info->chan_byte_size && !post_shuffle) { + if (vtx_info->chan_byte_size) { channel_start = ffs(mask) - 1; for (unsigned i = 0; i < MIN2(channel_start, num_channels); i++) channels[i] = Temp(0, s1); - } else if (vtx_info->chan_byte_size && post_shuffle && !(mask & 0x8)) { - num_channels = 3 - (ffs(mask) - 1); } /* load channels */ @@ -5237,7 +5232,7 @@ visit_load_input(isel_context* ctx, nir_intrinsic_instr* instr) } Temp fetch_dst; - if (channel_start == 0 && fetch_bytes == dst.bytes() && !post_shuffle && !expanded && + if (channel_start == 0 && fetch_bytes == dst.bytes() && !expanded && num_channels <= 3) { direct_fetch = true; fetch_dst = dst; @@ -5274,9 +5269,6 @@ visit_load_input(isel_context* ctx, nir_intrinsic_instr* instr) bool is_float = nfmt != V_008F0C_BUF_NUM_FORMAT_UINT && nfmt != V_008F0C_BUF_NUM_FORMAT_SINT; - static const unsigned swizzle_normal[4] = {0, 1, 2, 3}; - static const unsigned swizzle_post_shuffle[4] = {2, 1, 0, 3}; - const unsigned* swizzle = post_shuffle ? 
swizzle_post_shuffle : swizzle_normal; unsigned num_components = instr->dest.ssa.num_components; aco_ptr vec{create_instruction( @@ -5285,8 +5277,8 @@ visit_load_input(isel_context* ctx, nir_intrinsic_instr* instr) unsigned num_temp = 0; for (unsigned i = 0; i < num_components; i++) { unsigned idx = i + component; - if (swizzle[idx] < num_channels && channels[swizzle[idx]].id()) { - Temp channel = channels[swizzle[idx]]; + if (idx < num_channels && channels[idx].id()) { + Temp channel = channels[idx]; vec->operands[i] = Operand(channel); num_temp++; diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index 558f7ec..13dcec3 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -682,11 +682,6 @@ load_vs_input(struct radv_shader_context *ctx, unsigned driver_location, LLVMTyp unsigned attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[attrib_index]; unsigned attrib_stride = ctx->options->key.vs.vertex_attribute_strides[attrib_index]; - if (ctx->options->key.vs.vertex_post_shuffle & (1 << attrib_index)) { - /* Always load, at least, 3 channels for formats that need to be shuffled because X<->Z. */ - num_channels = MAX2(num_channels, 3); - } - unsigned desc_index = ctx->shader_info->vs.use_per_attribute_vb_descs ? 
attrib_index : attrib_binding; desc_index = util_bitcount(ctx->shader_info->vs.vb_desc_usage_mask & @@ -739,16 +734,6 @@ load_vs_input(struct radv_shader_context *ctx, unsigned driver_location, LLVMTyp ctx->ac.i32_0, ctx->ac.i32_0, num_channels, data_format, num_format, 0, true); } - if (ctx->options->key.vs.vertex_post_shuffle & (1 << attrib_index)) { - LLVMValueRef c[4]; - c[0] = ac_llvm_extract_elem(&ctx->ac, input, 2); - c[1] = ac_llvm_extract_elem(&ctx->ac, input, 1); - c[2] = ac_llvm_extract_elem(&ctx->ac, input, 0); - c[3] = ac_llvm_extract_elem(&ctx->ac, input, 3); - - input = ac_build_gather_values(&ctx->ac, c, 4); - } - input = radv_fixup_vertex_input_fetches(ctx, input, num_channels, is_float); for (unsigned chan = 0; chan < 4; chan++) { diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index 0f60386..c4a8d75 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -3760,24 +3760,46 @@ radv_lower_vs_input(nir_shader *nir, const struct radv_pipeline_key *pipeline_ke unsigned location = nir_intrinsic_base(intrin) - VERT_ATTRIB_GENERIC0; enum radv_vs_input_alpha_adjust alpha_adjust = pipeline_key->vs.vertex_alpha_adjust[location]; + bool post_shuffle = pipeline_key->vs.vertex_post_shuffle & (1 << location); - if (alpha_adjust == ALPHA_ADJUST_NONE) + if (alpha_adjust == ALPHA_ADJUST_NONE && !post_shuffle) continue; unsigned component = nir_intrinsic_component(intrin); unsigned num_components = intrin->dest.ssa.num_components; + static const unsigned swizzle_normal[4] = {0, 1, 2, 3}; + static const unsigned swizzle_post_shuffle[4] = {2, 1, 0, 3}; + const unsigned *swizzle = post_shuffle ? swizzle_post_shuffle : swizzle_normal; + b.cursor = nir_after_instr(instr); + nir_ssa_def *channels[4]; + + if (post_shuffle) { + /* Expand to load 3 components because it's shuffled like X<->Z. 
*/ + intrin->num_components = MAX2(component + num_components, 3); + intrin->dest.ssa.num_components = intrin->num_components; + + nir_intrinsic_set_component(intrin, 0); + } + + for (uint32_t i = 0; i < num_components; i++) { + unsigned idx = i + (post_shuffle ? component : 0); - if (component + num_components == 4) { + channels[i] = nir_channel(&b, &intrin->dest.ssa, swizzle[idx]); + } + + if (alpha_adjust != ALPHA_ADJUST_NONE && component + num_components == 4) { unsigned idx = num_components - 1; - nir_ssa_def *alpha = radv_adjust_vertex_fetch_alpha( - &b, alpha_adjust, nir_channel(&b, &intrin->dest.ssa, idx)); - nir_ssa_def *new_dest = nir_vector_insert_imm(&b, &intrin->dest.ssa, alpha, idx); - nir_ssa_def_rewrite_uses_after(&intrin->dest.ssa, new_dest, - new_dest->parent_instr); - progress = true; + channels[idx] = radv_adjust_vertex_fetch_alpha(&b, alpha_adjust, channels[idx]); } + + nir_ssa_def *new_dest = nir_vec(&b, channels, num_components); + + nir_ssa_def_rewrite_uses_after(&intrin->dest.ssa, new_dest, + new_dest->parent_instr); + + progress = true; } }