broadcom/compiler: move buffer loads to lower register pressure

author Iago Toral Quiroga <itoral@igalia.com>

Wed, 3 May 2023 09:52:37 +0000 (11:52 +0200)

committer Marge Bot <emma+marge@anholt.net>

Wed, 3 May 2023 13:01:58 +0000 (13:01 +0000)
author Iago Toral Quiroga <itoral@igalia.com>
Wed, 3 May 2023 09:52:37 +0000 (11:52 +0200)
committer Marge Bot <emma+marge@anholt.net>
Wed, 3 May 2023 13:01:58 +0000 (13:01 +0000)
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h

index 7a43c72..61d0c5c 100644 (file)
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -715,6 +715,11 @@ struct v3d_compile {
          bool disable_constant_ubo_load_sorting;
          bool sorted_any_ubo_loads;
  
+        /* Moves UBO/SSBO loads right before their first user (nir_opt_move).
+         * This can reduce register pressure.
+         */
+        bool move_buffer_loads;
+
          /* Emits ldunif for each new uniform, even if the uniform was already
           * emitted in the same block. Useful to compile shaders with high
           * register pressure or to disable the optimization during uniform
diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c

index 707c969..5355645 100644 (file)
--- a/src/broadcom/compiler/vir.c
+++ b/src/broadcom/compiler/vir.c
@@ -549,6 +549,7 @@ struct v3d_compiler_strategy {
          bool disable_gcm;
          bool disable_loop_unrolling;
          bool disable_ubo_load_sorting;
+        bool move_buffer_loads;
          bool disable_tmu_pipelining;
          uint32_t max_tmu_spills;
  };
@@ -583,6 +584,7 @@ vir_compile_init(const struct v3d_compiler *compiler,
          c->disable_general_tmu_sched = strategy->disable_general_tmu_sched;
          c->disable_tmu_pipelining = strategy->disable_tmu_pipelining;
          c->disable_constant_ubo_load_sorting = strategy->disable_ubo_load_sorting;
+        c->move_buffer_loads = strategy->move_buffer_loads;
          c->disable_gcm = strategy->disable_gcm;
          c->disable_loop_unrolling = V3D_DBG(NO_LOOP_UNROLL)
                  ? true : strategy->disable_loop_unrolling;
@@ -1649,8 +1651,11 @@ v3d_attempt_compile(struct v3d_compile *c)
          if (!c->disable_constant_ubo_load_sorting)
                  NIR_PASS(_, c->s, v3d_nir_sort_constant_ubo_loads, c);
  
+        const nir_move_options buffer_opts = c->move_buffer_loads ?
+                (nir_move_load_ubo | nir_move_load_ssbo) : 0;
          NIR_PASS(_, c->s, nir_opt_move, nir_move_load_uniform |
-                                       nir_move_const_undef);
+                                        nir_move_const_undef |
+                                        buffer_opts);
  
          v3d_nir_to_vir(c);
  }
@@ -1710,19 +1715,19 @@ int v3d_shaderdb_dump(struct v3d_compile *c,
   * because v3d_nir_to_vir will cap this to the actual minimum.
   */
  static const struct v3d_compiler_strategy strategies[] = {
-        /*0*/  { "default",                        4, 4, false, false, false, false, false,  0 },
-        /*1*/  { "disable general TMU sched",      4, 4, true,  false, false, false, false,  0 },
-        /*2*/  { "disable gcm",                    4, 4, true,  true,  false, false, false,  0 },
-        /*3*/  { "disable loop unrolling",         4, 4, true,  true,  true,  false, false,  0 },
-        /*4*/  { "disable UBO load sorting",       4, 4, true,  true,  true,  true,  false,  0 },
-        /*5*/  { "disable TMU pipelining",         4, 4, true,  true,  true,  true,  true,   0 },
-        /*6*/  { "lower thread count",             2, 1, false, false, false, false, false, -1 },
-        /*7*/  { "disable general TMU sched (2t)", 2, 1, true,  false, false, false, false, -1 },
-        /*8*/  { "disable gcm (2t)",               2, 1, true,  true,  false, false, false, -1 },
-        /*9*/  { "disable loop unrolling (2t)",    2, 1, true,  true,  true,  false, false, -1 },
-        /*10*/ { "disable UBO load sorting (2t)",  2, 1, true,  true,  true,  true,  false, -1 },
-        /*11*/ { "disable TMU pipelining (2t)",    2, 1, true,  true,  true,  true,  true,  -1 },
-        /*12*/ { "fallback scheduler",             2, 1, true,  true,  true,  true,  true,  -1 }
+        /*0*/  { "default",                        4, 4, false, false, false, false, false, false,  0 },
+        /*1*/  { "disable general TMU sched",      4, 4, true,  false, false, false, false, false,  0 },
+        /*2*/  { "disable gcm",                    4, 4, true,  true,  false, false, false, false,  0 },
+        /*3*/  { "disable loop unrolling",         4, 4, true,  true,  true,  false, false, false,  0 },
+        /*4*/  { "disable UBO load sorting",       4, 4, true,  true,  true,  true,  false, false,  0 },
+        /*5*/  { "disable TMU pipelining",         4, 4, true,  true,  true,  true,  false, true,   0 },
+        /*6*/  { "lower thread count",             2, 1, false, false, false, false, false, false, -1 },
+        /*7*/  { "disable general TMU sched (2t)", 2, 1, true,  false, false, false, false, false, -1 },
+        /*8*/  { "disable gcm (2t)",               2, 1, true,  true,  false, false, false, false, -1 },
+        /*9*/  { "disable loop unrolling (2t)",    2, 1, true,  true,  true,  false, false, false, -1 },
+        /*10*/ { "Move buffer loads (2t)",         2, 1, true,  true,  true,  true,  true,  false, -1 },
+        /*11*/ { "disable TMU pipelining (2t)",    2, 1, true,  true,  true,  true,  true,  false, -1 },
+        /*12*/ { "fallback scheduler",             2, 1, true,  true,  true,  true,  true,  false, -1 }
  };
  
  /**
@@ -1763,8 +1768,15 @@ skip_compile_strategy(struct v3d_compile *c, uint32_t idx)
             return !c->unrolled_any_loops;
     /* UBO load sorting: skip if we didn't sort any loads */
     case 4:
-   case 10:
             return !c->sorted_any_ubo_loads;
+   /* Move buffer loads: we assume any shader with difficult RA
+    * most likely has UBO / SSBO loads so we never try to skip.
+    * For now, we only try this for 2-thread compiles since it
+    * is expected to impact instruction counts and latency.
+    */
+   case 10:
+          assert(c->threads < 4);
+          return false;
     /* TMU pipelining: skip if we didn't pipeline any TMU ops */
     case 5:
     case 11:
author	Iago Toral Quiroga <itoral@igalia.com>
	Wed, 3 May 2023 09:52:37 +0000 (11:52 +0200)
committer	Marge Bot <emma+marge@anholt.net>
	Wed, 3 May 2023 13:01:58 +0000 (13:01 +0000)
src/broadcom/compiler/v3d_compiler.h		patch \| blob \| history
src/broadcom/compiler/vir.c		patch \| blob \| history