From 9c9e8c334981b1af7a709fa42cd5ef9dcf4d9791 Mon Sep 17 00:00:00 2001
From: Connor Abbott <cwabbott0@gmail.com>
Date: Fri, 14 Jan 2022 13:49:44 +0100
Subject: [PATCH] nir: Reorder ffma and fsub combining
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

It's relatively common to do something like "a * b - c", which on most
GPUs can be implemented in a single instruction. Before
opt_algebraic_late this will be something like
"fadd(fmul(a, b), fneg(c))", and we want to turn it info
"ffma(a, b, fneg(c))". But because the fsub pattern was first we
instead turned it into "fsub(fmul(a, b), c)". Fix this by reordering
them.

Selected shader-db results on freedreno:

total instructions in shared programs: 1561330 -> 1551619 (-0.62%)
instructions in affected programs: 780272 -> 770561 (-1.24%)
helped: 1941
HURT: 491
helped stats (abs) min: 1 max: 147 xÌ: 7.98 xÌ: 4
helped stats (rel) min: 0.07% max: 30.77% xÌ: 4.36% xÌ: 3.17%
HURT stats (abs)   min: 1 max: 307 xÌ: 11.76 xÌ: 5
HURT stats (rel)   min: 0.09% max: 18.71% xÌ: 2.26% xÌ: 1.38%
95% mean confidence interval for instructions value: -4.57 -3.41
95% mean confidence interval for instructions %-change: -3.21% -2.84%
Instructions are helped.

total nops in shared programs: 358926 -> 356263 (-0.74%)
nops in affected programs: 167116 -> 164453 (-1.59%)
helped: 1395
HURT: 859
helped stats (abs) min: 1 max: 108 xÌ: 6.80 xÌ: 3
helped stats (rel) min: 0.17% max: 100.00% xÌ: 19.15% xÌ: 10.57%
HURT stats (abs)   min: 1 max: 307 xÌ: 7.95 xÌ: 3
HURT stats (rel)   min: 0.00% max: 381.82% xÌ: 20.04% xÌ: 10.00%
95% mean confidence interval for nops value: -1.77 -0.59
95% mean confidence interval for nops %-change: -5.55% -2.87%
Nops are helped.

total non-nops in shared programs: 1202404 -> 1195356 (-0.59%)
non-nops in affected programs: 496682 -> 489634 (-1.42%)
helped: 1951
HURT: 265
helped stats (abs) min: 1 max: 39 xÌ: 4.02 xÌ: 3
helped stats (rel) min: 0.07% max: 15.38% xÌ: 2.97% xÌ: 1.96%
HURT stats (abs)   min: 1 max: 22 xÌ: 2.97 xÌ: 2
HURT stats (rel)   min: 0.05% max: 10.00% xÌ: 1.14% xÌ: 0.75%
95% mean confidence interval for non-nops value: -3.38 -2.99
95% mean confidence interval for non-nops %-change: -2.60% -2.36%
Non-nops are helped.

total systall in shared programs: 288317 -> 292975 (1.62%)
systall in affected programs: 87876 -> 92534 (5.30%)
helped: 388
HURT: 431
helped stats (abs) min: 1 max: 214 xÌ: 14.39 xÌ: 8
helped stats (rel) min: 0.25% max: 100.00% xÌ: 22.12% xÌ: 11.96%
HURT stats (abs)   min: 1 max: 232 xÌ: 23.77 xÌ: 7
HURT stats (rel)   min: 0.00% max: 1300.00% xÌ: 51.71% xÌ: 17.30%
95% mean confidence interval for systall value: 3.07 8.30
95% mean confidence interval for systall %-change: 9.49% 23.97%
Systall are HURT.

(The systall hurt is probably just due to having having fewer
instructions to hide latency with.)

Reviewed-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Acked-by: Daniel SchÃ¼rmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14554>
---
 src/compiler/nir/nir_opt_algebraic.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index 2a6d307..d5a9f72 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -2478,15 +2478,18 @@ late_optimizations = [
    # optimization loop can prevent other optimizations.
    (('fneg', ('fneg', a)), a),
 
+   # re-combine inexact mul+add to ffma. Do this before fsub so that a * b - c
+   # gets combined to fma(a, b, -c).
+   (('~fadd@16', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma16'),
+   (('~fadd@32', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma32'),
+   (('~fadd@64', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma64'),
+
    # Subtractions get lowered during optimization, so we need to recombine them
    (('fadd', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'),
    (('fneg', a), ('fmul', a, -1.0), 'options->lower_fneg'),
    (('iadd', a, ('ineg', 'b')), ('isub', 'a', 'b'), 'options->has_isub || options->lower_ineg'),
    (('ineg', a), ('isub', 0, a), 'options->lower_ineg'),
    (('iabs', a), ('imax', a, ('ineg', a)), 'options->lower_iabs'),
-   (('~fadd@16', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma16'),
-   (('~fadd@32', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma32'),
-   (('~fadd@64', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma64'),
 
    (('iadd', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), 'c(is_not_const)'), ('iadd3', a, b, c), 'options->has_iadd3'),
    (('iadd', ('isub(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), 'c(is_not_const)'), ('iadd3', a, ('ineg', b), c), 'options->has_iadd3'),
-- 
2.7.4