broadcom/compiler: add a strategy to disable scheduling of general TMU reads
author Iago Toral Quiroga <itoral@igalia.com>
Mon, 7 Mar 2022 13:42:39 +0000 (14:42 +0100)
committer Marge Bot <emma+marge@anholt.net>
Wed, 9 Mar 2022 15:53:04 +0000 (15:53 +0000)
Scheduling general TMU reads can add quite a bit of register pressure, so it
makes sense to be able to disable it to prevent us from dropping to 2 threads
or increasing spills (a toy sketch of the register-pressure effect follows the
first stats block):

total instructions in shared programs: 12672813 -> 12642413 (-0.24%)
instructions in affected programs: 256721 -> 226321 (-11.84%)
helped: 719
HURT: 77

total threads in shared programs: 415534 -> 416322 (0.19%)
threads in affected programs: 788 -> 1576 (100.00%)
helped: 394
HURT: 0

total uniforms in shared programs: 3711370 -> 3703861 (-0.20%)
uniforms in affected programs: 28859 -> 21350 (-26.02%)
helped: 204
HURT: 455

total max-temps in shared programs: 2159439 -> 2150686 (-0.41%)
max-temps in affected programs: 32945 -> 24192 (-26.57%)
helped: 585
HURT: 47

total spills in shared programs: 5966 -> 3255 (-45.44%)
spills in affected programs: 2933 -> 222 (-92.43%)
helped: 192
HURT: 4

total fills in shared programs: 9328 -> 4630 (-50.36%)
fills in affected programs: 5184 -> 486 (-90.62%)
helped: 196
HURT: 0
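
As a toy illustration of that register-pressure effect (standalone C, not
driver code; all names made up): when long-latency TMU loads are hoisted well
ahead of their uses, more load results are live at the same time, which is
exactly what can push a shader down to 2 threads or into spilling.

        /* Toy sketch: count how many "load results" are live at once for two
         * orderings of the same two loads and their uses.
         * +1 = a load result becomes live, -1 = its only use consumes it. */
        #include <stdio.h>

        static int max_live(const int *events, int n)
        {
                int live = 0, max = 0;
                for (int i = 0; i < n; i++) {
                        live += events[i];
                        if (live > max)
                                max = live;
                }
                return max;
        }

        int main(void)
        {
                const int hoisted[]     = { +1, +1, -1, -1 }; /* load A, load B, use A, use B */
                const int interleaved[] = { +1, -1, +1, -1 }; /* load A, use A, load B, use B */

                printf("max live, loads hoisted:     %d\n", max_live(hoisted, 4));     /* 2 */
                printf("max live, loads interleaved: %d\n", max_live(interleaved, 4)); /* 1 */
                return 0;
        }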

Compared to the stats before adding scheduling of non-filtered memory reads,
we see that we have now gotten back all that was lost and then some:

total instructions in shared programs: 12663186 -> 12642413 (-0.16%)
instructions in affected programs: 2051803 -> 2031030 (-1.01%)
helped: 4885
HURT: 3338

total threads in shared programs: 415870 -> 416322 (0.11%)
threads in affected programs: 896 -> 1348 (50.45%)
helped: 300
HURT: 74

total uniforms in shared programs: 3711629 -> 3703861 (-0.21%)
uniforms in affected programs: 158766 -> 150998 (-4.89%)
helped: 1973
HURT: 499

total max-temps in shared programs: 2138857 -> 2150686 (0.55%)
max-temps in affected programs: 177920 -> 189749 (6.65%)
helped: 2666
HURT: 2035

total spills in shared programs: 3860 -> 3255 (-15.67%)
spills in affected programs: 2653 -> 2048 (-22.80%)
helped: 77
HURT: 21

total fills in shared programs: 5573 -> 4630 (-16.92%)
fills in affected programs: 3839 -> 2896 (-24.56%)
helped: 81
HURT: 15

total sfu-stalls in shared programs: 39583 -> 38154 (-3.61%)
sfu-stalls in affected programs: 8993 -> 7564 (-15.89%)
helped: 1808
HURT: 1038

total nops in shared programs: 324894 -> 323685 (-0.37%)
nops in affected programs: 30362 -> 29153 (-3.98%)
helped: 2513
HURT: 2077

Reviewed-by: Alejandro PiƱeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15276>

src/broadcom/compiler/nir_to_vir.c
src/broadcom/compiler/v3d_compiler.h
src/broadcom/compiler/vir.c

diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index 7125c35..733778a 100644
@@ -3200,8 +3200,10 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
 
         case nir_intrinsic_load_ubo:
         case nir_intrinsic_load_ssbo:
-                if (!ntq_emit_load_unifa(c, instr))
+                if (!ntq_emit_load_unifa(c, instr)) {
                         ntq_emit_tmu_general(c, instr, false);
+                        c->has_general_tmu_load = true;
+                }
                 break;
 
         case nir_intrinsic_ssbo_atomic_add:
@@ -3228,14 +3230,17 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
         case nir_intrinsic_shared_atomic_xor:
         case nir_intrinsic_shared_atomic_exchange:
         case nir_intrinsic_shared_atomic_comp_swap:
-        case nir_intrinsic_load_shared:
         case nir_intrinsic_store_shared:
-        case nir_intrinsic_load_scratch:
         case nir_intrinsic_store_scratch:
                 ntq_emit_tmu_general(c, instr, true);
                 break;
 
-        case nir_intrinsic_image_load:
+        case nir_intrinsic_load_scratch:
+        case nir_intrinsic_load_shared:
+                ntq_emit_tmu_general(c, instr, true);
+                c->has_general_tmu_load = true;
+                break;
+
         case nir_intrinsic_image_store:
         case nir_intrinsic_image_atomic_add:
         case nir_intrinsic_image_atomic_imin:
@@ -3250,6 +3255,15 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                 v3d40_vir_emit_image_load_store(c, instr);
                 break;
 
+        case nir_intrinsic_image_load:
+                v3d40_vir_emit_image_load_store(c, instr);
+                /* Not really a general TMU load, but we only use this flag
+                 * for NIR scheduling and we do schedule these under the same
+                 * policy as general TMU.
+                 */
+                c->has_general_tmu_load = true;
+                break;
+
         case nir_intrinsic_get_ssbo_size:
                 ntq_store_dest(c, &instr->dest, 0,
                                vir_uniform(c, QUNIFORM_GET_SSBO_SIZE,
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index a0d48e4..c978995 100644
@@ -710,6 +710,11 @@ struct v3d_compile {
         bool disable_loop_unrolling;
         bool unrolled_any_loops;
 
+        /* Disables scheduling of general TMU loads (and unfiltered image load).
+         */
+        bool disable_general_tmu_sched;
+        bool has_general_tmu_load;
+
         /* Minimum number of threads we are willing to use to register allocate
          * a shader with the current compilation strategy. This only prevents
          * us from lowering the thread count to register allocate successfully,
diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
index 48405d0..60dcc38 100644
@@ -550,6 +550,7 @@ vir_compile_init(const struct v3d_compiler *compiler,
                  uint32_t max_threads,
                  uint32_t min_threads_for_reg_alloc,
                  uint32_t max_tmu_spills,
+                 bool disable_general_tmu_sched,
                  bool disable_loop_unrolling,
                  bool disable_constant_ubo_load_sorting,
                  bool disable_tmu_pipelining,
@@ -569,6 +570,7 @@ vir_compile_init(const struct v3d_compiler *compiler,
         c->min_threads_for_reg_alloc = min_threads_for_reg_alloc;
         c->max_tmu_spills = max_tmu_spills;
         c->fallback_scheduler = fallback_scheduler;
+        c->disable_general_tmu_sched = disable_general_tmu_sched;
         c->disable_tmu_pipelining = disable_tmu_pipelining;
         c->disable_constant_ubo_load_sorting = disable_constant_ubo_load_sorting;
         c->disable_loop_unrolling = V3D_DEBUG & V3D_DEBUG_NO_LOOP_UNROLL
@@ -1122,6 +1124,8 @@ v3d_intrinsic_dependency_cb(nir_intrinsic_instr *intr,
 static unsigned
 v3d_instr_delay_cb(nir_instr *instr, void *data)
 {
+   struct v3d_compile *c = (struct v3d_compile *) data;
+
    switch (instr->type) {
    case nir_instr_type_ssa_undef:
    case nir_instr_type_load_const:
@@ -1134,18 +1138,22 @@ v3d_instr_delay_cb(nir_instr *instr, void *data)
       return 1;
 
    case nir_instr_type_intrinsic: {
-      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
-      switch (intr->intrinsic) {
-      case nir_intrinsic_load_ssbo:
-      case nir_intrinsic_load_scratch:
-      case nir_intrinsic_load_shared:
-      case nir_intrinsic_image_load:
-         return 30;
-      case nir_intrinsic_load_ubo:
-         if (nir_src_is_divergent(intr->src[1]))
+      if (!c->disable_general_tmu_sched) {
+         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+         switch (intr->intrinsic) {
+         case nir_intrinsic_load_ssbo:
+         case nir_intrinsic_load_scratch:
+         case nir_intrinsic_load_shared:
+         case nir_intrinsic_image_load:
             return 30;
-         FALLTHROUGH;
-      default:
+         case nir_intrinsic_load_ubo:
+            if (nir_src_is_divergent(intr->src[1]))
+               return 30;
+            FALLTHROUGH;
+         default:
+            return 1;
+         }
+      } else {
          return 1;
       }
       break;
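
For context (not part of this diff): v3d_instr_delay_cb is the per-instruction
delay estimate the driver hands to NIR's instruction scheduler, which is how
the `data` pointer above ends up being the v3d_compile context. A rough sketch
of the registration, assuming the nir_schedule_options callback fields rather
than quoting the actual call site:

        /* Sketch (assumed wiring, not quoted from this patch). */
        struct nir_schedule_options schedule_options = {
                .intrinsic_cb = v3d_intrinsic_dependency_cb,
                .intrinsic_cb_data = c,
                .instr_delay_cb = v3d_instr_delay_cb,
                .instr_delay_cb_data = c,
        };
        nir_schedule(c->s, &schedule_options);
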
@@ -1674,20 +1682,23 @@ struct v3d_compiler_strategy {
         const char *name;
         uint32_t max_threads;
         uint32_t min_threads;
+        bool disable_general_tmu_sched;
         bool disable_loop_unrolling;
         bool disable_ubo_load_sorting;
         bool disable_tmu_pipelining;
         uint32_t max_tmu_spills;
 } static const strategies[] = {
-  /*0*/ { "default",                        4, 4, false, false, false,  0 },
-  /*1*/ { "disable loop unrolling",         4, 4, true,  false, false,  0 },
-  /*2*/ { "disable UBO load sorting",       4, 4, true,  true,  false,  0 },
-  /*3*/ { "disable TMU pipelining",         4, 4, true,  true,  true,   0 },
-  /*4*/ { "lower thread count",             2, 1, false, false, false, -1 },
-  /*5*/ { "disable loop unrolling (ltc)",   2, 1, true,  false, false, -1 },
-  /*6*/ { "disable UBO load sorting (ltc)", 2, 1, true,  true,  false, -1 },
-  /*7*/ { "disable TMU pipelining (ltc)",   2, 1, true,  true,  true,  -1 },
-  /*8*/ { "fallback scheduler",             2, 1, true,  true,  true,  -1 }
+  /*0*/  { "default",                        4, 4, false, false, false, false,  0 },
+  /*1*/  { "disable general TMU sched",      4, 4, true,  false, false, false,  0 },
+  /*2*/  { "disable loop unrolling",         4, 4, true,  true,  false, false,  0 },
+  /*3*/  { "disable UBO load sorting",       4, 4, true,  true,  true,  false,  0 },
+  /*4*/  { "disable TMU pipelining",         4, 4, true,  true,  true,  true,   0 },
+  /*5*/  { "lower thread count",             2, 1, false, false, false, false, -1 },
+  /*6*/  { "disable general TMU sched (2t)", 2, 1, true,  false, false, false, -1 },
+  /*7*/  { "disable loop unrolling (2t)",    2, 1, true,  true,  false, false, -1 },
+  /*8*/  { "disable UBO load sorting (2t)",  2, 1, true,  true,  true,  false, -1 },
+  /*9*/  { "disable TMU pipelining (2t)",    2, 1, true,  true,  true,  true,  -1 },
+  /*10*/ { "fallback scheduler",             2, 1, true,  true,  true,  true,  -1 }
 };
 
 /**
@@ -1695,7 +1706,7 @@ struct v3d_compiler_strategy {
  * attempt disabling it alone won't allow us to compile the shader successfuly,
  * since we'll end up with the same code. Detect these scenarios so we can
  * avoid wasting time with useless compiles. We should also consider if the
  * strategy changes other aspects of the compilation process though, like
  * spilling, and not skip it in that case.
  */
 static bool
@@ -1714,20 +1725,24 @@ skip_compile_strategy(struct v3d_compile *c, uint32_t idx)
    }
 
    switch (idx) {
-   /* Loop unrolling: skip if we didn't unroll any loops */
+   /* General TMU sched.: skip if we didn't emit any TMU loads */
    case 1:
-   case 5:
+   case 6:
+           return !c->has_general_tmu_load;
+   /* Loop unrolling: skip if we didn't unroll any loops */
+   case 2:
+   case 7:
            return !c->unrolled_any_loops;
    /* UBO load sorting: skip if we didn't sort any loads */
-   case 2:
-   case 6:
+   case 3:
+   case 8:
            return !c->sorted_any_ubo_loads;
    /* TMU pipelining: skip if we didn't pipeline any TMU ops */
-   case 3:
-   case 7:
+   case 4:
+   case 9:
            return !c->pipelined_any_tmu;
    /* Lower thread count: skip if we already tried less that 4 threads */
-   case 4:
+   case 5:
           return c->threads < 4;
    default:
            return false;
@@ -1780,6 +1795,7 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
                                      strategies[strat].max_threads,
                                      strategies[strat].min_threads,
                                      strategies[strat].max_tmu_spills,
+                                     strategies[strat].disable_general_tmu_sched,
                                      strategies[strat].disable_loop_unrolling,
                                      strategies[strat].disable_ubo_load_sorting,
                                      strategies[strat].disable_tmu_pipelining,