From 9c9e8c334981b1af7a709fa42cd5ef9dcf4d9791 Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Fri, 14 Jan 2022 13:49:44 +0100 Subject: [PATCH] nir: Reorder ffma and fsub combining MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit It's relatively common to do something like "a * b - c", which on most GPUs can be implemented in a single instruction. Before opt_algebraic_late this will be something like "fadd(fmul(a, b), fneg(c))", and we want to turn it into "ffma(a, b, fneg(c))". But because the fsub pattern was first we instead turned it into "fsub(fmul(a, b), c)". Fix this by reordering them. Selected shader-db results on freedreno: total instructions in shared programs: 1561330 -> 1551619 (-0.62%) instructions in affected programs: 780272 -> 770561 (-1.24%) helped: 1941 HURT: 491 helped stats (abs) min: 1 max: 147 x̄: 7.98 x̃: 4 helped stats (rel) min: 0.07% max: 30.77% x̄: 4.36% x̃: 3.17% HURT stats (abs) min: 1 max: 307 x̄: 11.76 x̃: 5 HURT stats (rel) min: 0.09% max: 18.71% x̄: 2.26% x̃: 1.38% 95% mean confidence interval for instructions value: -4.57 -3.41 95% mean confidence interval for instructions %-change: -3.21% -2.84% Instructions are helped. total nops in shared programs: 358926 -> 356263 (-0.74%) nops in affected programs: 167116 -> 164453 (-1.59%) helped: 1395 HURT: 859 helped stats (abs) min: 1 max: 108 x̄: 6.80 x̃: 3 helped stats (rel) min: 0.17% max: 100.00% x̄: 19.15% x̃: 10.57% HURT stats (abs) min: 1 max: 307 x̄: 7.95 x̃: 3 HURT stats (rel) min: 0.00% max: 381.82% x̄: 20.04% x̃: 10.00% 95% mean confidence interval for nops value: -1.77 -0.59 95% mean confidence interval for nops %-change: -5.55% -2.87% Nops are helped. 
total non-nops in shared programs: 1202404 -> 1195356 (-0.59%) non-nops in affected programs: 496682 -> 489634 (-1.42%) helped: 1951 HURT: 265 helped stats (abs) min: 1 max: 39 x̄: 4.02 x̃: 3 helped stats (rel) min: 0.07% max: 15.38% x̄: 2.97% x̃: 1.96% HURT stats (abs) min: 1 max: 22 x̄: 2.97 x̃: 2 HURT stats (rel) min: 0.05% max: 10.00% x̄: 1.14% x̃: 0.75% 95% mean confidence interval for non-nops value: -3.38 -2.99 95% mean confidence interval for non-nops %-change: -2.60% -2.36% Non-nops are helped. total systall in shared programs: 288317 -> 292975 (1.62%) systall in affected programs: 87876 -> 92534 (5.30%) helped: 388 HURT: 431 helped stats (abs) min: 1 max: 214 x̄: 14.39 x̃: 8 helped stats (rel) min: 0.25% max: 100.00% x̄: 22.12% x̃: 11.96% HURT stats (abs) min: 1 max: 232 x̄: 23.77 x̃: 7 HURT stats (rel) min: 0.00% max: 1300.00% x̄: 51.71% x̃: 17.30% 95% mean confidence interval for systall value: 3.07 8.30 95% mean confidence interval for systall %-change: 9.49% 23.97% Systall are HURT. (The systall hurt is probably just due to having fewer instructions to hide latency with.) Reviewed-by: Alyssa Rosenzweig Acked-by: Daniel Schürmann Part-of: --- src/compiler/nir/nir_opt_algebraic.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py index 2a6d307..d5a9f72 100644 --- a/src/compiler/nir/nir_opt_algebraic.py +++ b/src/compiler/nir/nir_opt_algebraic.py @@ -2478,15 +2478,18 @@ late_optimizations = [ # optimization loop can prevent other optimizations. (('fneg', ('fneg', a)), a), + # re-combine inexact mul+add to ffma. Do this before fsub so that a * b - c + # gets combined to fma(a, b, -c). 
+ (('~fadd@16', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma16'), + (('~fadd@32', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma32'), + (('~fadd@64', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma64'), + # Subtractions get lowered during optimization, so we need to recombine them (('fadd', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'), (('fneg', a), ('fmul', a, -1.0), 'options->lower_fneg'), (('iadd', a, ('ineg', 'b')), ('isub', 'a', 'b'), 'options->has_isub || options->lower_ineg'), (('ineg', a), ('isub', 0, a), 'options->lower_ineg'), (('iabs', a), ('imax', a, ('ineg', a)), 'options->lower_iabs'), - (('~fadd@16', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma16'), - (('~fadd@32', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma32'), - (('~fadd@64', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma64'), (('iadd', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), 'c(is_not_const)'), ('iadd3', a, b, c), 'options->has_iadd3'), (('iadd', ('isub(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), 'c(is_not_const)'), ('iadd3', a, ('ineg', b), c), 'options->has_iadd3'), -- 2.7.4