From 3668da7c83bf52f639313e9527878f9bd27b4a1c Mon Sep 17 00:00:00 2001
From: Rhys Perry
Date: Fri, 20 Jan 2023 11:32:17 +0000
Subject: [PATCH] nir: use xyzw order for precise fdot
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Fixes flickering grass in Immortals Fenyx Rising.

fossil-db (gfx1100):
Totals from 13969 (10.38% of 134574) affected shaders:
MaxWaves: 442794 -> 442878 (+0.02%)
Instrs: 4861105 -> 4901408 (+0.83%); split: -0.02%, +0.85%
CodeSize: 24316100 -> 24396272 (+0.33%); split: -0.03%, +0.35%
VGPRs: 446256 -> 445572 (-0.15%); split: -0.20%, +0.05%
Latency: 28122456 -> 28162233 (+0.14%); split: -0.10%, +0.24%
InvThroughput: 2899673 -> 2904323 (+0.16%); split: -0.07%, +0.23%
VClause: 119599 -> 119631 (+0.03%); split: -0.07%, +0.09%
SClause: 186636 -> 186265 (-0.20%); split: -0.23%, +0.03%
Copies: 301370 -> 300386 (-0.33%); split: -0.75%, +0.42%
Branches: 85066 -> 85047 (-0.02%); split: -0.02%, +0.00%
PreSGPRs: 436167 -> 436137 (-0.01%)
PreVGPRs: 329715 -> 329809 (+0.03%); split: -0.01%, +0.04%

fossil-db (gfx1100, RADV_DEBUG=invariantgeom):
Totals from 43116 (32.04% of 134574) affected shaders:
MaxWaves: 1332938 -> 1333012 (+0.01%); split: +0.01%, -0.00%
Instrs: 16424513 -> 16658021 (+1.42%); split: -0.06%, +1.48%
CodeSize: 81258868 -> 81827860 (+0.70%); split: -0.07%, +0.77%
VGPRs: 1720368 -> 1719648 (-0.04%); split: -0.19%, +0.15%
SpillSGPRs: 1670 -> 1600 (-4.19%); split: -5.27%, +1.08%
Latency: 82063766 -> 82425418 (+0.44%); split: -0.23%, +0.67%
InvThroughput: 9665803 -> 9727810 (+0.64%); split: -0.09%, +0.73%
VClause: 449662 -> 451099 (+0.32%); split: -0.32%, +0.64%
SClause: 498841 -> 498639 (-0.04%); split: -0.24%, +0.20%
Copies: 1001020 -> 1000770 (-0.02%); split: -1.20%, +1.17%
Branches: 237580 -> 239637 (+0.87%); split: -0.01%, +0.88%
PreSGPRs: 1198167 -> 1198024 (-0.01%); split: -0.01%, +0.00%
PreVGPRs: 1225202 -> 1225035 (-0.01%); split: -0.06%, +0.05%

fossil-db (navi10):
Totals from 13969 (10.38% of
134563) affected shaders:
MaxWaves: 474386 -> 474508 (+0.03%); split: +0.05%, -0.03%
Instrs: 3740895 -> 3771566 (+0.82%); split: -0.00%, +0.82%
CodeSize: 19426592 -> 19459916 (+0.17%); split: -0.00%, +0.18%
VGPRs: 389916 -> 389852 (-0.02%); split: -0.09%, +0.07%
Latency: 25452927 -> 25502482 (+0.19%); split: -0.14%, +0.34%
InvThroughput: 3880807 -> 3923144 (+1.09%); split: -0.07%, +1.16%
VClause: 66835 -> 66712 (-0.18%); split: -0.38%, +0.20%
SClause: 178805 -> 178802 (-0.00%); split: -0.01%, +0.01%
Copies: 167601 -> 167625 (+0.01%); split: -0.54%, +0.56%
Branches: 83788 -> 83784 (-0.00%)
PreSGPRs: 388229 -> 388216 (-0.00%)
PreVGPRs: 342984 -> 343062 (+0.02%); split: -0.01%, +0.03%

fossil-db (navi10, RADV_DEBUG=invariantgeom):
Totals from 43116 (32.04% of 134563) affected shaders:
MaxWaves: 1260184 -> 1256414 (-0.30%); split: +0.10%, -0.40%
Instrs: 12804951 -> 12983628 (+1.40%); split: -0.01%, +1.41%
CodeSize: 65813224 -> 66137852 (+0.49%); split: -0.03%, +0.52%
VGPRs: 1556396 -> 1561340 (+0.32%); split: -0.09%, +0.41%
SpillSGPRs: 1377 -> 1395 (+1.31%)
Latency: 76095867 -> 76355111 (+0.34%); split: -0.32%, +0.66%
InvThroughput: 13546863 -> 13788789 (+1.79%); split: -0.05%, +1.84%
VClause: 310910 -> 311283 (+0.12%); split: -0.63%, +0.75%
SClause: 474878 -> 474941 (+0.01%); split: -0.09%, +0.10%
Copies: 639367 -> 637610 (-0.27%); split: -1.03%, +0.76%
Branches: 240178 -> 240185 (+0.00%); split: -0.00%, +0.00%
PreSGPRs: 1056594 -> 1056590 (-0.00%); split: -0.00%, +0.00%
PreVGPRs: 1247950 -> 1247798 (-0.01%); split: -0.05%, +0.04%

Signed-off-by: Rhys Perry
Reviewed-by: Timur Kristóf
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/7920
Cc: mesa-stable
Part-of:
---
 src/compiler/nir/nir_lower_alu_width.c | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/src/compiler/nir/nir_lower_alu_width.c b/src/compiler/nir/nir_lower_alu_width.c
index 9b1b3d0..05a4871 100644
--- a/src/compiler/nir/nir_lower_alu_width.c
+++ 
b/src/compiler/nir/nir_lower_alu_width.c @@ -98,26 +98,27 @@ nir_alu_ssa_dest_init(nir_alu_instr *alu, unsigned num_components, static nir_ssa_def * lower_reduction(nir_alu_instr *alu, nir_op chan_op, nir_op merge_op, - nir_builder *builder) + nir_builder *builder, bool reverse_order) { unsigned num_components = nir_op_infos[alu->op].input_sizes[0]; nir_ssa_def *last = NULL; - for (int i = num_components - 1; i >= 0; i--) { + for (int i = 0; i < num_components; i++) { + int channel = reverse_order ? num_components - 1 - i : i; nir_alu_instr *chan = nir_alu_instr_create(builder->shader, chan_op); nir_alu_ssa_dest_init(chan, 1, alu->dest.dest.ssa.bit_size); nir_alu_src_copy(&chan->src[0], &alu->src[0], chan); - chan->src[0].swizzle[0] = chan->src[0].swizzle[i]; + chan->src[0].swizzle[0] = chan->src[0].swizzle[channel]; if (nir_op_infos[chan_op].num_inputs > 1) { assert(nir_op_infos[chan_op].num_inputs == 2); nir_alu_src_copy(&chan->src[1], &alu->src[1], chan); - chan->src[1].swizzle[0] = chan->src[1].swizzle[i]; + chan->src[1].swizzle[0] = chan->src[1].swizzle[channel]; } chan->exact = alu->exact; nir_builder_instr_insert(builder, &chan->instr); - if (i == num_components - 1) { + if (i == 0) { last = &chan->dest.dest.ssa; } else { last = nir_build_alu(builder, merge_op, @@ -145,24 +146,31 @@ will_lower_ffma(nir_shader *shader, unsigned bit_size) static nir_ssa_def * lower_fdot(nir_alu_instr *alu, nir_builder *builder) { + /* Reversed order can result in lower instruction count because it + * creates more MAD/FMA in the case of fdot(a, vec4(b, 1.0)). + * Some games expect xyzw order, so only reverse the order for imprecise fdot. + */ + bool reverse_order = !builder->exact; + /* If we don't want to lower ffma, create several ffma instead of fmul+fadd * and fusing later because fusing is not possible for exact fdot instructions. 
*/ if (will_lower_ffma(builder->shader, alu->dest.dest.ssa.bit_size)) - return lower_reduction(alu, nir_op_fmul, nir_op_fadd, builder); + return lower_reduction(alu, nir_op_fmul, nir_op_fadd, builder, reverse_order); unsigned num_components = nir_op_infos[alu->op].input_sizes[0]; nir_ssa_def *prev = NULL; - for (int i = num_components - 1; i >= 0; i--) { + for (int i = 0; i < num_components; i++) { + int channel = reverse_order ? num_components - 1 - i : i; nir_alu_instr *instr = nir_alu_instr_create( builder->shader, prev ? nir_op_ffma : nir_op_fmul); nir_alu_ssa_dest_init(instr, 1, alu->dest.dest.ssa.bit_size); for (unsigned j = 0; j < 2; j++) { nir_alu_src_copy(&instr->src[j], &alu->src[j], instr); - instr->src[j].swizzle[0] = alu->src[j].swizzle[i]; + instr->src[j].swizzle[0] = alu->src[j].swizzle[channel]; } - if (i != num_components - 1) + if (i != 0) instr->src[2].src = nir_src_for_ssa(prev); instr->exact = builder->exact; @@ -203,7 +211,7 @@ lower_alu_instr_width(nir_builder *b, nir_instr *instr, void *_data) case name##4: \ case name##8: \ case name##16: \ - return lower_reduction(alu, chan, merge, b); \ + return lower_reduction(alu, chan, merge, b, true); \ switch (alu->op) { case nir_op_vec16: -- 2.7.4