From 558a6006299544ee5f77843f094015c62558f4ad Mon Sep 17 00:00:00 2001
From: Emma Anholt <emma@anholt.net>
Date: Tue, 14 Dec 2021 14:35:03 -0800
Subject: [PATCH] nir_to_tgsi: Enable fdot_replicates flag.

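TGSI's DP2/DP3/DP4 opcodes replicate their scalar result to every enabled
destination channel, which is the behavior fdot_replicates describes.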

This lets lower_vec_to_regs coalesce the DP output into the .yzw channels,
giving an impressive shader-db win on softpipe:

total instructions in shared programs: 2929840 -> 2794036 (-4.64%)
instructions in affected programs: 1651438 -> 1515634 (-8.22%)
total temps in shared programs: 372730 -> 332744 (-10.73%)
temps in affected programs: 118151 -> 78165 (-33.84%)

and a minor one on r300:

total instructions in shared programs: 51238 -> 51149 (-0.17%)
instructions in affected programs: 2621 -> 2532 (-3.40%)
total vinst in shared programs: 15655 -> 15618 (-0.24%)
vinst in affected programs: 468 -> 431 (-7.91%)
total temps in shared programs: 9838 -> 9828 (-0.10%)
temps in affected programs: 59 -> 49 (-16.95%)

and a bigger one on i915g:

total instructions in shared programs: 398064 -> 395901 (-0.54%)
instructions in affected programs: 29271 -> 27108 (-7.39%)
total tex_indirect in shared programs: 12261 -> 12233 (-0.23%)
tex_indirect in affected programs: 98 -> 70 (-28.57%)
LOST:   0
GAINED: 5

The r300 change is less impressive partly because the r300 backend already
does some copy propagation of its own, and partly because intermediate
storage of DPs now takes a vec4 instead of a scalar.

Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14200>
---
 src/compiler/nir/nir_builder_opcodes_h.py | 5 +++++
 src/gallium/auxiliary/nir/nir_to_tgsi.c   | 4 ++++
 src/gallium/drivers/i915/i915_screen.c    | 2 ++
 src/gallium/drivers/r300/r300_screen.c    | 4 ++++
 src/gallium/drivers/softpipe/sp_screen.c  | 1 +
 5 files changed, 16 insertions(+)

diff --git a/src/compiler/nir/nir_builder_opcodes_h.py b/src/compiler/nir/nir_builder_opcodes_h.py
index 35e5ca7..7fc6af9 100644
--- a/src/compiler/nir/nir_builder_opcodes_h.py
+++ b/src/compiler/nir/nir_builder_opcodes_h.py
@@ -30,9 +30,13 @@ def src_decl_list(num_srcs):
 
 def src_list(num_srcs):
    return ', '.join('src' + str(i) for i in range(num_srcs))
+
+def needs_num_components(opcode):
+   return "replicated" in opcode.name
 %>
 
 % for name, opcode in sorted(opcodes.items()):
+% if not needs_num_components(opcode):
 static inline nir_ssa_def *
 nir_${name}(nir_builder *build, ${src_decl_list(opcode.num_inputs)})
 {
@@ -43,6 +47,7 @@ nir_${name}(nir_builder *build, ${src_decl_list(opcode.num_inputs)})
    return nir_build_alu_src_arr(build, nir_op_${name}, srcs);
 % endif
 }
+% endif
 % endfor
 
 % for name, opcode in sorted(INTR_OPCODES.items()):
diff --git a/src/gallium/auxiliary/nir/nir_to_tgsi.c b/src/gallium/auxiliary/nir/nir_to_tgsi.c
index 5ad0130..e5097d7 100644
--- a/src/gallium/auxiliary/nir/nir_to_tgsi.c
+++ b/src/gallium/auxiliary/nir/nir_to_tgsi.c
@@ -858,6 +858,9 @@ ntt_emit_alu(struct ntt_compile *c, nir_alu_instr *instr)
       [nir_op_fdot2] = { TGSI_OPCODE_DP2 },
       [nir_op_fdot3] = { TGSI_OPCODE_DP3 },
       [nir_op_fdot4] = { TGSI_OPCODE_DP4 },
+      [nir_op_fdot2_replicated] = { TGSI_OPCODE_DP2 },
+      [nir_op_fdot3_replicated] = { TGSI_OPCODE_DP3 },
+      [nir_op_fdot4_replicated] = { TGSI_OPCODE_DP4 },
       [nir_op_ffloor] = { TGSI_OPCODE_FLR, TGSI_OPCODE_DFLR },
       [nir_op_ffract] = { TGSI_OPCODE_FRC, TGSI_OPCODE_DFRAC },
       [nir_op_fceil] = { TGSI_OPCODE_CEIL, TGSI_OPCODE_DCEIL },
@@ -3191,6 +3194,7 @@ nir_to_tgsi(struct nir_shader *s,
 }
 
 static const nir_shader_compiler_options nir_to_tgsi_compiler_options = {
+   .fdot_replicates = true,
    .fuse_ffma32 = true,
    .fuse_ffma64 = true,
    .lower_extract_byte = true,
diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c
index d2ab19c..75d57e1 100644
--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -106,6 +106,7 @@ i915_get_name(struct pipe_screen *screen)
 }
 
 static const nir_shader_compiler_options i915_compiler_options = {
+   .fdot_replicates = true,
    .fuse_ffma32 = true,
    .lower_bitops = true, /* required for !CAP_INTEGERS nir_to_tgsi */
    .lower_extract_byte = true,
@@ -122,6 +123,7 @@ static const nir_shader_compiler_options i915_compiler_options = {
 };
 
 static const struct nir_shader_compiler_options gallivm_nir_options = {
+   .fdot_replicates = true,
    .lower_bitops = true, /* required for !CAP_INTEGERS nir_to_tgsi */
    .lower_scmp = true,
    .lower_flrp32 = true,
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index 27c6835..8d4f902 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -474,6 +474,7 @@ static int r300_get_video_param(struct pipe_screen *screen,
 }
 
 static const nir_shader_compiler_options r500_vs_compiler_options = {
+   .fdot_replicates = true,
    .fuse_ffma32 = true,
    .fuse_ffma64 = true,
    .lower_bitops = true,
@@ -499,6 +500,7 @@ static const nir_shader_compiler_options r500_vs_compiler_options = {
 };
 
 static const nir_shader_compiler_options r500_fs_compiler_options = {
+   .fdot_replicates = true,
    .fuse_ffma32 = true,
    .fuse_ffma64 = true,
    .lower_bitops = true,
@@ -525,6 +527,7 @@ static const nir_shader_compiler_options r500_fs_compiler_options = {
 };
 
 static const nir_shader_compiler_options r300_vs_compiler_options = {
+   .fdot_replicates = true,
    .fuse_ffma32 = true,
    .fuse_ffma64 = true,
    .lower_bitops = true,
@@ -549,6 +552,7 @@ static const nir_shader_compiler_options r300_vs_compiler_options = {
 };
 
 static const nir_shader_compiler_options r300_fs_compiler_options = {
+   .fdot_replicates = true,
    .fuse_ffma32 = true,
    .fuse_ffma64 = true,
    .lower_bitops = true,
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
index c87d406..4984f60 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -75,6 +75,7 @@ softpipe_get_name(struct pipe_screen *screen)
 }
 
 static const nir_shader_compiler_options sp_compiler_options = {
+   .fdot_replicates = true,
    .fuse_ffma32 = true,
    .fuse_ffma64 = true,
    .lower_extract_byte = true,
-- 
2.7.4