The freedreno trace changes were suggested by Rob Clark.
ALU performance is higher, because ffma is used more often, but register
usage is higher too, because ternary opcodes (such as ffma) usually need
at least 3 live registers.
54793 shaders in 33659 tests
Totals:
SGPRS: 2639746 -> 2642938 (0.12 %)
VGPRS: 1534120 -> 1536392 (0.15 %)
Spilled SGPRs: 3541 -> 3618 (2.17 %)
Spilled VGPRs: 33 -> 44 (33.33 %)
Scratch size: 292 -> 312 (6.85 %) dwords per thread
Code Size: 55639836 -> 55620116 (-0.04 %) bytes
Max Waves: 964785 -> 963977 (-0.08 %)
Totals from affected shaders:
SGPRS: 1105800 -> 1108992 (0.29 %)
VGPRS: 635292 -> 637564 (0.36 %)
Spilled SGPRs: 3193 -> 3270 (2.41 %)
Spilled VGPRs: 33 -> 44 (33.33 %)
Scratch size: 36 -> 56 (55.56 %) dwords per thread
Code Size: 31568708 -> 31548988 (-0.06 %) bytes
Max Waves: 319991 -> 319183 (-0.25 %)
Reviewed-by: Connor Abbott <cwabbott0@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6596>
- path: gputest/furmark.trace
expectations:
- device: freedreno-a630
- checksum: de674022e53fc9e0a9eb217f8bf0fe03
+ checksum: af6e1faf11407a7e7c416f2c532de029
# Note: Requires GL3.3
- path: gputest/gimark.trace
expectations:
- device: freedreno-a630
- checksum: 2cae8e2104356e2b3017cbd953cf7b4a
+ checksum: 47419914b87422b267e20b6981a7eb43
- path: gputest/pixmark-julia-fp32.trace
expectations:
- device: freedreno-a630
expectations:
# Looks fine, but totally different shape from the rendering on i965.
- device: freedreno-a630
- checksum: 86d678c70b8adf27095ace1a6bbfe2d2
+ checksum: 9ee5a036510be0f506705eacc1516bf3
- path: gputest/plot3d.trace
expectations:
- device: freedreno-a630
- checksum: 67a9eb692e694b11107860bbcd47d493
+ checksum: 42aba3ab943dae2fe952cae1ff91c354
# Note: Requires GL4 for tess.
- path: gputest/tessmark.trace
expectations:
- device: freedreno-a630
- checksum: 985e231b58b7dc4b6da34ff32f8ebb82
+ checksum: 8688b3904b6b2bc591d8b669ecae4d53
- path: gputest/triangle.trace
expectations:
- device: freedreno-a630
- path: glmark2/effect2d-kernel=1,1,1,1,1;1,1,1,1,1;1,1,1,1,1;.rdc
expectations:
- device: freedreno-a630
- checksum: 2346a6597f4d1f20b493e8d6a8f7e592
+ checksum: 2964d37446db126a5fe462b1ba4542cd
- path: glmark2/function-fragment-complexity=low:fragment-steps=5.rdc
expectations:
# Incorrect rendering, a bunch of the area is uniform gray when it should
- path: glmark2/shading-shading=gouraud.rdc
expectations:
- device: freedreno-a630
- checksum: fcc26fca31375b216382e69bc5f113fb
+ checksum: bd9058f041bd2d59c039cccdb7d50bf7
- path: glmark2/shading-shading=phong.rdc
# Some speckling on the main specular highlight that may just be
# mediump artifacts
expectations:
- device: freedreno-a630
checksum: d8b5931669733240797f1acf5d98db25
- # Very yellow terrain compared to i965, may just be mediump artifacts.
- - path: glmark2/terrain.rdc
- expectations:
- - device: freedreno-a630
- checksum: 114f7dfe97768d9c565a29f656c8f9cf
- path: glmark2/texture-texture-filter=linear.rdc
expectations:
- device: freedreno-a630
- path: gputest/furmark.trace
expectations:
- device: gl-radeonsi-stoney
- checksum: 1c569668d608c644f353caa177d577c6
+ checksum: d71c0d8e6c46c8f29d1aa8d0ed7d3c87
- path: gputest/pixmark-piano.trace
expectations:
- device: gl-radeonsi-stoney
- checksum: a0e1d6358f76666603b08eab383af080
+ checksum: 777d48e82cabceef6d9489189f91d266
- path: gputest/triangle.trace
expectations:
- device: gl-radeonsi-stoney
- path: glmark2/shadow.rdc
expectations:
- device: gl-radeonsi-stoney
- checksum: 4bf5ca9ce641de1031eb8125d80a3005
+ checksum: 03dfbf026a0f0ab643e5a6ef19623e81
- path: glmark2/terrain.rdc
expectations:
- device: gl-radeonsi-stoney
- path: godot/Material Testers.x86_64_2020.04.08_13.38_frame799.rdc
expectations:
- device: gl-radeonsi-stoney
- checksum: 5164e238381e7d77a64e3de771cc005f
+ checksum: 990abd360dc380c95ee2645f8b402d47
- path: gputest/gimark.trace
expectations:
- device: gl-radeonsi-stoney
- path: gputest/pixmark-piano.trace
expectations:
- device: gl-radeonsi-stoney
- checksum: a0e1d6358f76666603b08eab383af080
+ checksum: 777d48e82cabceef6d9489189f91d266
- path: gputest/pixmark-volplosion.trace
expectations:
- device: gl-radeonsi-stoney
- checksum: 2fba173643c014bcfa4b31eb55a514b9
+ checksum: 708f92a8ac8aef23a4a544cc5ec755d6
- path: gputest/plot3d.trace
expectations:
- device: gl-radeonsi-stoney
- checksum: fd367551aa74e2903e0590a893da01a6
+ checksum: f9e6c1cb70add69cf2a4724800d48b25
- path: gputest/tessmark.trace
expectations:
- device: gl-radeonsi-stoney
- path: supertuxkart/supertuxkart-antediluvian-abyss.rdc
expectations:
- device: gl-radeonsi-stoney
- checksum: 17f4039392a65ad23133cb2cac82dba4
+ checksum: a2c4c127873f93b7db4ef48ea9fb7689
- path: supertuxkart/supertuxkart-menu.rdc
expectations:
- device: gl-radeonsi-stoney
- path: supertuxkart/supertuxkart-ravenbridge-mansion.rdc
expectations:
- device: gl-radeonsi-stoney
- checksum: 46f08af5c49d711b41d4082f8a5cf6d6
+ checksum: c8f9eae92c67c7d53db4d69a703e3914
(('ffract', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'),
(('fceil', a), ('fneg', ('ffloor', ('fneg', a))), 'options->lower_fceil'),
(('ffma', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma'),
- (('~fadd', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma'),
+ # Always lower inexact ffma, because it will be fused back by late optimizations (nir_opt_algebraic_late).
+ (('~ffma', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma'),
(('~fmul', ('fadd', ('iand', ('ineg', ('b2i', 'a@bool')), ('fmul', b, c)), '#d'), '#e'),
('bcsel', a, ('fmul', ('fadd', ('fmul', b, c), d), e), ('fmul', d, e))),
(('fneg', a), ('fsub', 0.0, a), 'options->lower_negate'),
(('ineg', a), ('isub', 0, a), 'options->lower_negate'),
(('iabs', a), ('imax', a, ('ineg', a)), 'options->lower_iabs'),
+ (('~fadd', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma'),
# These are duplicated from the main optimizations table. The late
# patterns that rearrange expressions like x - .5 < 0 to x < .5 can create
if (changed)
si_nir_opts(nir, false);
+ /* Run late optimizations to fuse ffma. */
+ bool more_late_algebraic = true;
+ while (more_late_algebraic) {
+ more_late_algebraic = false;
+ NIR_PASS(more_late_algebraic, nir, nir_opt_algebraic_late);
+ NIR_PASS_V(nir, nir_opt_constant_folding);
+ NIR_PASS_V(nir, nir_copy_prop);
+ NIR_PASS_V(nir, nir_opt_dce);
+ NIR_PASS_V(nir, nir_opt_cse);
+ }
+
NIR_PASS_V(nir, nir_lower_bool_to_int32);
NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);