nir,radeonsi: move ffma fusing to late optimizations for better codegen
author Marek Olšák <marek.olsak@amd.com>
Fri, 4 Sep 2020 09:55:25 +0000 (05:55 -0400)
committer Marge Bot <eric+marge@anholt.net>
Wed, 16 Sep 2020 02:39:02 +0000 (02:39 +0000)
The freedreno trace changes were suggested by Rob Clark.

ALU performance is higher because ffma is used more often, but register
usage is higher too, because ternary opcodes (such as ffma) usually need
at least 3 live registers.

54793 shaders in 33659 tests
Totals:
SGPRS: 2639746 -> 2642938 (0.12 %)
VGPRS: 1534120 -> 1536392 (0.15 %)
Spilled SGPRs: 3541 -> 3618 (2.17 %)
Spilled VGPRs: 33 -> 44 (33.33 %)
Scratch size: 292 -> 312 (6.85 %) dwords per thread
Code Size: 55639836 -> 55620116 (-0.04 %) bytes
Max Waves: 964785 -> 963977 (-0.08 %)

Totals from affected shaders:
SGPRS: 1105800 -> 1108992 (0.29 %)
VGPRS: 635292 -> 637564 (0.36 %)
Spilled SGPRs: 3193 -> 3270 (2.41 %)
Spilled VGPRs: 33 -> 44 (33.33 %)
Scratch size: 36 -> 56 (55.56 %) dwords per thread
Code Size: 31568708 -> 31548988 (-0.06 %) bytes
Max Waves: 319991 -> 319183 (-0.25 %)

Reviewed-by: Connor Abbott <cwabbott0@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6596>

.gitlab-ci/traces-freedreno.yml
.gitlab-ci/traces-radeonsi.yml
src/compiler/nir/nir_opt_algebraic.py
src/gallium/drivers/radeonsi/si_shader_nir.c

index 163833a..daa72ee 100644 (file)
@@ -11,12 +11,12 @@ traces:
   - path: gputest/furmark.trace
     expectations:
       - device: freedreno-a630
-        checksum: de674022e53fc9e0a9eb217f8bf0fe03
+        checksum: af6e1faf11407a7e7c416f2c532de029
 # Note: Requires GL3.3
   - path: gputest/gimark.trace
     expectations:
       - device: freedreno-a630
-        checksum: 2cae8e2104356e2b3017cbd953cf7b4a
+        checksum: 47419914b87422b267e20b6981a7eb43
   - path: gputest/pixmark-julia-fp32.trace
     expectations:
       - device: freedreno-a630
@@ -37,16 +37,16 @@ traces:
     expectations:
       # Looks fine, but totally different shape from the rendering on i965.
       - device: freedreno-a630
-        checksum: 86d678c70b8adf27095ace1a6bbfe2d2
+        checksum: 9ee5a036510be0f506705eacc1516bf3
   - path: gputest/plot3d.trace
     expectations:
       - device: freedreno-a630
-        checksum: 67a9eb692e694b11107860bbcd47d493
+        checksum: 42aba3ab943dae2fe952cae1ff91c354
 # Note: Requires GL4 for tess.
   - path: gputest/tessmark.trace
     expectations:
       - device: freedreno-a630
-        checksum: 985e231b58b7dc4b6da34ff32f8ebb82
+        checksum: 8688b3904b6b2bc591d8b669ecae4d53
   - path: gputest/triangle.trace
     expectations:
       - device: freedreno-a630
@@ -149,7 +149,7 @@ traces:
   - path: glmark2/effect2d-kernel=1,1,1,1,1;1,1,1,1,1;1,1,1,1,1;.rdc
     expectations:
       - device: freedreno-a630
-        checksum: 2346a6597f4d1f20b493e8d6a8f7e592
+        checksum: 2964d37446db126a5fe462b1ba4542cd
   - path: glmark2/function-fragment-complexity=low:fragment-steps=5.rdc
     expectations:
       # Incorrect rendering, a bunch of the area is uniform gray when it should
@@ -215,7 +215,7 @@ traces:
   - path: glmark2/shading-shading=gouraud.rdc
     expectations:
       - device: freedreno-a630
-        checksum: fcc26fca31375b216382e69bc5f113fb
+        checksum: bd9058f041bd2d59c039cccdb7d50bf7
   - path: glmark2/shading-shading=phong.rdc
     # Some speckling on the main specular highlight that may just be
     # mediump artifacts
@@ -226,11 +226,6 @@ traces:
     expectations:
       - device: freedreno-a630
         checksum: d8b5931669733240797f1acf5d98db25
-    # Very yellow terrain compared to i965, may just be mediump artifacts.
-  - path: glmark2/terrain.rdc
-    expectations:
-      - device: freedreno-a630
-        checksum: 114f7dfe97768d9c565a29f656c8f9cf
   - path: glmark2/texture-texture-filter=linear.rdc
     expectations:
       - device: freedreno-a630
index 111fc63..13b1da5 100644 (file)
@@ -33,11 +33,11 @@ traces:
   - path: gputest/furmark.trace
     expectations:
       - device: gl-radeonsi-stoney
-        checksum: 1c569668d608c644f353caa177d577c6
+        checksum: d71c0d8e6c46c8f29d1aa8d0ed7d3c87
   - path: gputest/pixmark-piano.trace
     expectations:
       - device: gl-radeonsi-stoney
-        checksum: a0e1d6358f76666603b08eab383af080
+        checksum: 777d48e82cabceef6d9489189f91d266
   - path: gputest/triangle.trace
     expectations:
       - device: gl-radeonsi-stoney
@@ -153,7 +153,7 @@ traces:
   - path: glmark2/shadow.rdc
     expectations:
       - device: gl-radeonsi-stoney
-        checksum: 4bf5ca9ce641de1031eb8125d80a3005
+        checksum: 03dfbf026a0f0ab643e5a6ef19623e81
   - path: glmark2/terrain.rdc
     expectations:
       - device: gl-radeonsi-stoney
@@ -173,7 +173,7 @@ traces:
   - path: godot/Material Testers.x86_64_2020.04.08_13.38_frame799.rdc
     expectations:
       - device: gl-radeonsi-stoney
-        checksum: 5164e238381e7d77a64e3de771cc005f
+        checksum: 990abd360dc380c95ee2645f8b402d47
   - path: gputest/gimark.trace
     expectations:
       - device: gl-radeonsi-stoney
@@ -189,15 +189,15 @@ traces:
   - path: gputest/pixmark-piano.trace
     expectations:
       - device: gl-radeonsi-stoney
-        checksum: a0e1d6358f76666603b08eab383af080
+        checksum: 777d48e82cabceef6d9489189f91d266
   - path: gputest/pixmark-volplosion.trace
     expectations:
       - device: gl-radeonsi-stoney
-        checksum: 2fba173643c014bcfa4b31eb55a514b9
+        checksum: 708f92a8ac8aef23a4a544cc5ec755d6
   - path: gputest/plot3d.trace
     expectations:
       - device: gl-radeonsi-stoney
-        checksum: fd367551aa74e2903e0590a893da01a6
+        checksum: f9e6c1cb70add69cf2a4724800d48b25
   - path: gputest/tessmark.trace
     expectations:
       - device: gl-radeonsi-stoney
@@ -229,7 +229,7 @@ traces:
   - path: supertuxkart/supertuxkart-antediluvian-abyss.rdc
     expectations:
       - device: gl-radeonsi-stoney
-        checksum: 17f4039392a65ad23133cb2cac82dba4
+        checksum: a2c4c127873f93b7db4ef48ea9fb7689
   - path: supertuxkart/supertuxkart-menu.rdc
     expectations:
       - device: gl-radeonsi-stoney
@@ -237,4 +237,4 @@ traces:
   - path: supertuxkart/supertuxkart-ravenbridge-mansion.rdc
     expectations:
       - device: gl-radeonsi-stoney
-        checksum: 46f08af5c49d711b41d4082f8a5cf6d6
+        checksum: c8f9eae92c67c7d53db4d69a703e3914
index f2ef598..39c07ce 100644 (file)
@@ -194,7 +194,8 @@ optimizations.extend([
    (('ffract', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'),
    (('fceil', a), ('fneg', ('ffloor', ('fneg', a))), 'options->lower_fceil'),
    (('ffma', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma'),
-   (('~fadd', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma'),
+   # Always lower inexact ffma, because it will be fused back by late optimizations (nir_opt_algebraic_late).
+   (('~ffma', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma'),
 
    (('~fmul', ('fadd', ('iand', ('ineg', ('b2i', 'a@bool')), ('fmul', b, c)), '#d'), '#e'),
     ('bcsel', a, ('fmul', ('fadd', ('fmul', b, c), d), e), ('fmul', d, e))),
@@ -2027,6 +2028,7 @@ late_optimizations = [
    (('fneg', a), ('fsub', 0.0, a), 'options->lower_negate'),
    (('ineg', a), ('isub', 0, a), 'options->lower_negate'),
    (('iabs', a), ('imax', a, ('ineg', a)), 'options->lower_iabs'),
+   (('~fadd', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma'),
 
    # These are duplicated from the main optimizations table.  The late
    # patterns that rearrange expressions like x - .5 < 0 to x < .5 can create
index 534973b..4b879bf 100644 (file)
@@ -698,6 +698,17 @@ static void si_lower_nir(struct si_screen *sscreen, struct nir_shader *nir)
    if (changed)
       si_nir_opts(nir, false);
 
+   /* Run late optimizations to fuse ffma. */
+   bool more_late_algebraic = true;
+   while (more_late_algebraic) {
+      more_late_algebraic = false;
+      NIR_PASS(more_late_algebraic, nir, nir_opt_algebraic_late);
+      NIR_PASS_V(nir, nir_opt_constant_folding);
+      NIR_PASS_V(nir, nir_copy_prop);
+      NIR_PASS_V(nir, nir_opt_dce);
+      NIR_PASS_V(nir, nir_opt_cse);
+   }
+
    NIR_PASS_V(nir, nir_lower_bool_to_int32);
    NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);