broadcom/compiler: only handle accumulator classes if present
authorIago Toral Quiroga <itoral@igalia.com>
Wed, 29 Sep 2021 10:10:31 +0000 (12:10 +0200)
committerMarge Bot <emma+marge@anholt.net>
Fri, 13 Oct 2023 22:37:41 +0000 (22:37 +0000)
Reviewed-by: Alejandro PiƱeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25450>

src/broadcom/compiler/vir_register_allocate.c

index f2df35c..e78ccb7 100644 (file)
@@ -53,6 +53,17 @@ get_class_bit_any(const struct v3d_device_info *devinfo)
         else
                 return CLASS_BITS_PHYS;
 }
+
+static uint8_t
+filter_class_bits(const struct v3d_device_info *devinfo, uint8_t class_bits)
+{
+   if (!devinfo->has_accumulators) {
+      assert(class_bits & CLASS_BITS_PHYS);
+      class_bits = CLASS_BITS_PHYS;
+   }
+   return class_bits;
+}
+
 static inline uint32_t
 temp_to_node(struct v3d_compile *c, uint32_t temp)
 {
@@ -413,8 +424,10 @@ v3d_setup_spill_base(struct v3d_compile *c)
                  */
                 if (c->spilling) {
                         int temp_class = CLASS_BITS_PHYS;
-                        if (i != c->spill_base.index)
+                        if (c->devinfo->has_accumulators &&
+                            i != c->spill_base.index) {
                                 temp_class |= CLASS_BITS_ACC;
+                        }
                         add_node(c, i, temp_class);
                 }
         }
@@ -473,14 +486,16 @@ v3d_emit_spill_tmua(struct v3d_compile *c,
          * temp will be used immediately so just like the uniform above we
          * can allow accumulators.
          */
+        int temp_class =
+                filter_class_bits(c->devinfo, CLASS_BITS_PHYS | CLASS_BITS_ACC);
         if (!fill_dst) {
                 struct qreg dst = vir_TMUWT(c);
                 assert(dst.file == QFILE_TEMP);
-                add_node(c, dst.index, CLASS_BITS_PHYS | CLASS_BITS_ACC);
+                add_node(c, dst.index, temp_class);
         } else {
                 *fill_dst = vir_LDTMU(c);
                 assert(fill_dst->file == QFILE_TEMP);
-                add_node(c, fill_dst->index, CLASS_BITS_PHYS | CLASS_BITS_ACC);
+                add_node(c, fill_dst->index, temp_class);
         }
 
         /* Temps across the thread switch we injected can't be assigned to
@@ -662,8 +677,10 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
                                          * instruction immediately after so we
                                          * can use ACC.
                                          */
-                                        add_node(c, temp.index, CLASS_BITS_PHYS |
-                                                                CLASS_BITS_ACC);
+                                        int temp_class =
+                                                filter_class_bits(c->devinfo, CLASS_BITS_PHYS |
+                                                                              CLASS_BITS_ACC);
+                                        add_node(c, temp.index, temp_class);
                                 } else {
                                         /* If we have a postponed spill, we
                                          * don't need a fill as the temp would
@@ -941,6 +958,7 @@ vir_init_reg_sets(struct v3d_compiler *compiler)
                 compiler->reg_class_phys[threads] =
                         ra_alloc_contig_reg_class(compiler->regs, 1);
 
+                /* Init physical regs */
                 for (int i = phys_index;
                      i < phys_index + (PHYS_COUNT >> threads); i++) {
                         if (compiler->devinfo->has_accumulators)
@@ -949,16 +967,15 @@ vir_init_reg_sets(struct v3d_compiler *compiler)
                         ra_class_add_reg(compiler->reg_class_any[threads], i);
                 }
 
+                /* Init accumulator regs */
                 if (compiler->devinfo->has_accumulators) {
                         for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) {
                                 ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
                                 ra_class_add_reg(compiler->reg_class_any[threads], i);
                         }
-                }
-                /* r5 can only store a single 32-bit value, so not much can
-                 * use it.
-                 */
-                if (compiler->devinfo->has_accumulators) {
+                        /* r5 can only store a single 32-bit value, so not much can
+                         * use it.
+                         */
                         ra_class_add_reg(compiler->reg_class_r5[threads],
                                          ACC_INDEX + 5);
                         ra_class_add_reg(compiler->reg_class_any[threads],
@@ -1081,21 +1098,23 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
                  * because ldunif has usually a shorter lifespan, allowing for
                  * more accumulator reuse and QPU merges.
                  */
-                if (!inst->qpu.sig.ldunif) {
-                        uint8_t class_bits =
-                                get_temp_class_bits(c, inst->dst.index) &
-                                ~CLASS_BITS_R5;
-                        set_temp_class_bits(c, inst->dst.index,
-                                            class_bits);
-
-                } else {
-                        /* Until V3D 4.x, we could only load a uniform
-                         * to r5, so we'll need to spill if uniform
-                         * loads interfere with each other.
-                         */
-                        if (c->devinfo->ver < 40) {
+                if (c->devinfo->has_accumulators) {
+                        if (!inst->qpu.sig.ldunif) {
+                                uint8_t class_bits =
+                                        get_temp_class_bits(c, inst->dst.index) &
+                                        ~CLASS_BITS_R5;
                                 set_temp_class_bits(c, inst->dst.index,
-                                                    CLASS_BITS_R5);
+                                                    class_bits);
+
+                        } else {
+                                /* Until V3D 4.x, we could only load a uniform
+                                 * to r5, so we'll need to spill if uniform
+                                 * loads interfere with each other.
+                                 */
+                                if (c->devinfo->ver < 40) {
+                                        set_temp_class_bits(c, inst->dst.index,
+                                                            CLASS_BITS_R5);
+                                }
                         }
                 }
         }
@@ -1152,8 +1171,10 @@ v3d_register_allocate(struct v3d_compile *c)
                         c->thread_index--;
         }
 
-        c->g = ra_alloc_interference_graph(c->compiler->regs,
-                                           c->num_temps + ARRAY_SIZE(acc_nodes));
+        unsigned num_ra_nodes = c->num_temps;
+        if (c->devinfo->has_accumulators)
+                num_ra_nodes += ARRAY_SIZE(acc_nodes);
+        c->g = ra_alloc_interference_graph(c->compiler->regs, num_ra_nodes);
         ra_set_select_reg_callback(c->g, v3d_ra_select_callback, &callback_data);
 
         /* Make some fixed nodes for the accumulators, which we will need to
@@ -1162,8 +1183,8 @@ v3d_register_allocate(struct v3d_compile *c)
          * live in, but the classes take up a lot of memory to set up, so we
          * don't want to make too many.
          */
-        for (uint32_t i = 0; i < ACC_COUNT + c->num_temps; i++) {
-                if (i < ACC_COUNT) {
+        for (uint32_t i = 0; i < num_ra_nodes; i++) {
+                if (c->devinfo->has_accumulators && i < ACC_COUNT) {
                         acc_nodes[i] = i;
                         ra_set_node_reg(c->g, acc_nodes[i], ACC_INDEX + i);
                         c->nodes.info[i].priority = 0;