broadcom/compiler: skip unnecessary unifa writes
author Iago Toral Quiroga <itoral@igalia.com>
Thu, 18 Feb 2021 07:32:13 +0000 (08:32 +0100)
committer Marge Bot <eric+marge@anholt.net>
Tue, 23 Feb 2021 08:08:01 +0000 (08:08 +0000)
If a new UBO load reads from exactly the offset that follows the
previous UBO load (something that is fairly common, for example when
reading a matrix), we can skip the unifa write (with its 3 delay slots)
and just keep issuing ldunifa to read the consecutive addresses.

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9128>

src/broadcom/compiler/nir_to_vir.c
src/broadcom/compiler/v3d_compiler.h

index d27b184..bc9473c 100644 (file)
@@ -2589,17 +2589,34 @@ ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr)
         if (c->key->environment == V3D_ENVIRONMENT_OPENGL)
                 index++;
 
-        struct qreg base_offset =
-                vir_uniform(c, QUNIFORM_UBO_ADDR,
-                            v3d_unit_data_create(index, const_offset));
-        const_offset = 0;
-
-        struct qreg unifa = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_UNIFA);
-        if (!dynamic_src) {
-                vir_MOV_dest(c, unifa, base_offset);
+        /* We can only keep track of the last unifa address we used with
+         * constant offset loads.
+         */
+        bool skip_unifa = false;
+        if (dynamic_src) {
+                c->last_unifa_block = NULL;
+        } else if (c->cur_block == c->last_unifa_block &&
+                   c->last_unifa_index == index &&
+                   c->last_unifa_offset == const_offset) {
+                skip_unifa = true;
         } else {
-                vir_ADD_dest(c, unifa, base_offset,
-                             ntq_get_src(c, instr->src[1], 0));
+                c->last_unifa_block = c->cur_block;
+                c->last_unifa_index = index;
+                c->last_unifa_offset = const_offset;
+        }
+
+        if (!skip_unifa) {
+                struct qreg base_offset =
+                        vir_uniform(c, QUNIFORM_UBO_ADDR,
+                                    v3d_unit_data_create(index, const_offset));
+
+                struct qreg unifa = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_UNIFA);
+                if (!dynamic_src) {
+                        vir_MOV_dest(c, unifa, base_offset);
+                } else {
+                        vir_ADD_dest(c, unifa, base_offset,
+                                     ntq_get_src(c, instr->src[1], 0));
+                }
         }
 
         for (uint32_t i = 0; i < nir_intrinsic_dest_components(instr); i++) {
@@ -2608,6 +2625,7 @@ ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr)
                 ldunifa->qpu.sig.ldunifa = true;
                 struct qreg data = vir_emit_def(c, ldunifa);
                 ntq_store_dest(c, &instr->dest, i, vir_MOV(c, data));
+                c->last_unifa_offset += 4;
         }
 }
 
index 8e83a8e..e1f3511 100644 (file)
@@ -650,6 +650,15 @@ struct v3d_compile {
          */
         bool disable_ldunif_opt;
 
+        /* Last UBO index and offset used with a unifa/ldunifa sequence and the
+         * block where it was emitted. This is used to skip unifa writes (and
+         * their 3 delay slots) when the next UBO load reads right after the
+         * previous one in the same block.
+         */
+        struct qblock *last_unifa_block;
+        int32_t last_unifa_index;
+        uint32_t last_unifa_offset;
+
         /* State for whether we're executing on each channel currently.  0 if
          * yes, otherwise a block number + 1 that the channel jumped to.
          */