From 558a6006299544ee5f77843f094015c62558f4ad Mon Sep 17 00:00:00 2001
From: Emma Anholt
Date: Tue, 14 Dec 2021 14:35:03 -0800
Subject: [PATCH] nir_to_tgsi: Enable fdot_replicates flag.

That's how the TGSI math opcodes work. This lets lower_vec_to_regs
coalesce the DP output into the .yzw channels, giving an impressive
shader-db win on softpipe:

total instructions in shared programs: 2929840 -> 2794036 (-4.64%)
instructions in affected programs: 1651438 -> 1515634 (-8.22%)
total temps in shared programs: 372730 -> 332744 (-10.73%)
temps in affected programs: 118151 -> 78165 (-33.84%)

and a minor one on r300:

total instructions in shared programs: 51238 -> 51149 (-0.17%)
instructions in affected programs: 2621 -> 2532 (-3.40%)
total vinst in shared programs: 15655 -> 15618 (-0.24%)
vinst in affected programs: 468 -> 431 (-7.91%)
total temps in shared programs: 9838 -> 9828 (-0.10%)
temps in affected programs: 59 -> 49 (-16.95%)

and a bigger one on i915g:

total instructions in shared programs: 398064 -> 395901 (-0.54%)
instructions in affected programs: 29271 -> 27108 (-7.39%)
total tex_indirect in shared programs: 12261 -> 12233 (-0.23%)
tex_indirect in affected programs: 98 -> 70 (-28.57%)
LOST: 0
GAINED: 5

The r300 change is less impressive because it does some backend
copy-prop, but also because intermediate storage of DPs now takes a
vec4 instead of a scalar.

Reviewed-by: Jason Ekstrand Part-of: --- src/compiler/nir/nir_builder_opcodes_h.py | 5 +++++ src/gallium/auxiliary/nir/nir_to_tgsi.c | 4 ++++ src/gallium/drivers/i915/i915_screen.c | 2 ++ src/gallium/drivers/r300/r300_screen.c | 4 ++++ src/gallium/drivers/softpipe/sp_screen.c | 1 + 5 files changed, 16 insertions(+) diff --git a/src/compiler/nir/nir_builder_opcodes_h.py b/src/compiler/nir/nir_builder_opcodes_h.py index 35e5ca7..7fc6af9 100644 --- a/src/compiler/nir/nir_builder_opcodes_h.py +++ b/src/compiler/nir/nir_builder_opcodes_h.py @@ -30,9 +30,13 @@ def src_decl_list(num_srcs): def src_list(num_srcs): return ', '.join('src' + str(i) for i in range(num_srcs)) + +def needs_num_components(opcode): + return "replicated" in opcode.name %> % for name, opcode in sorted(opcodes.items()): +% if not needs_num_components(opcode): static inline nir_ssa_def * nir_${name}(nir_builder *build, ${src_decl_list(opcode.num_inputs)}) { @@ -43,6 +47,7 @@ nir_${name}(nir_builder *build, ${src_decl_list(opcode.num_inputs)}) return nir_build_alu_src_arr(build, nir_op_${name}, srcs); % endif } +% endif % endfor % for name, opcode in sorted(INTR_OPCODES.items()): diff --git a/src/gallium/auxiliary/nir/nir_to_tgsi.c b/src/gallium/auxiliary/nir/nir_to_tgsi.c index 5ad0130..e5097d7 100644 --- a/src/gallium/auxiliary/nir/nir_to_tgsi.c +++ b/src/gallium/auxiliary/nir/nir_to_tgsi.c @@ -858,6 +858,9 @@ ntt_emit_alu(struct ntt_compile *c, nir_alu_instr *instr) [nir_op_fdot2] = { TGSI_OPCODE_DP2 }, [nir_op_fdot3] = { TGSI_OPCODE_DP3 }, [nir_op_fdot4] = { TGSI_OPCODE_DP4 }, + [nir_op_fdot2_replicated] = { TGSI_OPCODE_DP2 }, + [nir_op_fdot3_replicated] = { TGSI_OPCODE_DP3 }, + [nir_op_fdot4_replicated] = { TGSI_OPCODE_DP4 }, [nir_op_ffloor] = { TGSI_OPCODE_FLR, TGSI_OPCODE_DFLR }, [nir_op_ffract] = { TGSI_OPCODE_FRC, TGSI_OPCODE_DFRAC }, [nir_op_fceil] = { TGSI_OPCODE_CEIL, TGSI_OPCODE_DCEIL }, @@ -3191,6 +3194,7 @@ nir_to_tgsi(struct nir_shader *s, } static const nir_shader_compiler_options 
nir_to_tgsi_compiler_options = { + .fdot_replicates = true, .fuse_ffma32 = true, .fuse_ffma64 = true, .lower_extract_byte = true, diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c index d2ab19c..75d57e1 100644 --- a/src/gallium/drivers/i915/i915_screen.c +++ b/src/gallium/drivers/i915/i915_screen.c @@ -106,6 +106,7 @@ i915_get_name(struct pipe_screen *screen) } static const nir_shader_compiler_options i915_compiler_options = { + .fdot_replicates = true, .fuse_ffma32 = true, .lower_bitops = true, /* required for !CAP_INTEGERS nir_to_tgsi */ .lower_extract_byte = true, @@ -122,6 +123,7 @@ static const nir_shader_compiler_options i915_compiler_options = { }; static const struct nir_shader_compiler_options gallivm_nir_options = { + .fdot_replicates = true, .lower_bitops = true, /* required for !CAP_INTEGERS nir_to_tgsi */ .lower_scmp = true, .lower_flrp32 = true, diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c index 27c6835..8d4f902 100644 --- a/src/gallium/drivers/r300/r300_screen.c +++ b/src/gallium/drivers/r300/r300_screen.c @@ -474,6 +474,7 @@ static int r300_get_video_param(struct pipe_screen *screen, } static const nir_shader_compiler_options r500_vs_compiler_options = { + .fdot_replicates = true, .fuse_ffma32 = true, .fuse_ffma64 = true, .lower_bitops = true, @@ -499,6 +500,7 @@ static const nir_shader_compiler_options r500_vs_compiler_options = { }; static const nir_shader_compiler_options r500_fs_compiler_options = { + .fdot_replicates = true, .fuse_ffma32 = true, .fuse_ffma64 = true, .lower_bitops = true, @@ -525,6 +527,7 @@ static const nir_shader_compiler_options r500_fs_compiler_options = { }; static const nir_shader_compiler_options r300_vs_compiler_options = { + .fdot_replicates = true, .fuse_ffma32 = true, .fuse_ffma64 = true, .lower_bitops = true, @@ -549,6 +552,7 @@ static const nir_shader_compiler_options r300_vs_compiler_options = { }; static const 
nir_shader_compiler_options r300_fs_compiler_options = { + .fdot_replicates = true, .fuse_ffma32 = true, .fuse_ffma64 = true, .lower_bitops = true, diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c index c87d406..4984f60 100644 --- a/src/gallium/drivers/softpipe/sp_screen.c +++ b/src/gallium/drivers/softpipe/sp_screen.c @@ -75,6 +75,7 @@ softpipe_get_name(struct pipe_screen *screen) } static const nir_shader_compiler_options sp_compiler_options = { + .fdot_replicates = true, .fuse_ffma32 = true, .fuse_ffma64 = true, .lower_extract_byte = true, -- 2.7.4