vc4: Implement live intervals using a CFG.
authorEric Anholt <eric@anholt.net>
Thu, 10 Mar 2016 20:53:57 +0000 (12:53 -0800)
committerEric Anholt <eric@anholt.net>
Wed, 13 Jul 2016 00:41:59 +0000 (17:41 -0700)
Right now our CFG is always a trivial single basic block, but that will
change when we enable loops.

src/gallium/drivers/vc4/Makefile.sources
src/gallium/drivers/vc4/vc4_qir.c
src/gallium/drivers/vc4/vc4_qir.h
src/gallium/drivers/vc4/vc4_qir_live_variables.c [new file with mode: 0644]
src/gallium/drivers/vc4/vc4_qpu.h
src/gallium/drivers/vc4/vc4_register_allocate.c

index 612a0a4..76e46f5 100644 (file)
@@ -31,6 +31,7 @@ C_SOURCES := \
        vc4_opt_vpm.c \
        vc4_program.c \
        vc4_qir.c \
+       vc4_qir_live_variables.c \
        vc4_qir_lower_uniforms.c \
        vc4_qir_schedule.c \
        vc4_qir_validate.c \
index c23b332..3f59fce 100644 (file)
@@ -224,6 +224,53 @@ qir_writes_r4(struct qinst *inst)
         }
 }
 
+/**
+ * Returns a 4-bit mask of the destination channels that inst writes, as
+ * selected by its dst.pack mode.
+ *
+ * The 8A/8B/8C/8D modes map to bits 0/1/2/3, the 16A/16B modes to the low and
+ * high pair of bits, and the remaining (NOP/8888/32-bit) modes to all four
+ * bits.  qir_is_mul() decides which of the two pack enums is consulted.
+ */
+uint8_t
+qir_channels_written(struct qinst *inst)
+{
+        if (!qir_is_mul(inst)) {
+                switch (inst->dst.pack) {
+                case QPU_PACK_A_8A:
+                case QPU_PACK_A_8A_SAT:
+                        return 0x1;
+                case QPU_PACK_A_8B:
+                case QPU_PACK_A_8B_SAT:
+                        return 0x2;
+                case QPU_PACK_A_16A:
+                case QPU_PACK_A_16A_SAT:
+                        return 0x3;
+                case QPU_PACK_A_8C:
+                case QPU_PACK_A_8C_SAT:
+                        return 0x4;
+                case QPU_PACK_A_8D:
+                case QPU_PACK_A_8D_SAT:
+                        return 0x8;
+                case QPU_PACK_A_16B:
+                case QPU_PACK_A_16B_SAT:
+                        return 0xc;
+                case QPU_PACK_A_NOP:
+                case QPU_PACK_A_8888:
+                case QPU_PACK_A_8888_SAT:
+                case QPU_PACK_A_32_SAT:
+                        return 0xf;
+                }
+        } else {
+                switch (inst->dst.pack) {
+                case QPU_PACK_MUL_8A:
+                        return 0x1;
+                case QPU_PACK_MUL_8B:
+                        return 0x2;
+                case QPU_PACK_MUL_8C:
+                        return 0x4;
+                case QPU_PACK_MUL_8D:
+                        return 0x8;
+                case QPU_PACK_MUL_NOP:
+                case QPU_PACK_MUL_8888:
+                        return 0xf;
+                }
+        }
+
+        /* No known pack mode matched. */
+        unreachable("Bad pack field");
+}
+
 static void
 qir_print_reg(struct vc4_compile *c, struct qreg reg, bool write)
 {
index f3e6575..5099b7f 100644 (file)
@@ -38,6 +38,7 @@
 
 #include "vc4_screen.h"
 #include "vc4_qpu_defines.h"
+#include "vc4_qpu.h"
 #include "kernel/vc4_packet.h"
 #include "pipe/p_state.h"
 
@@ -353,6 +354,14 @@ struct qblock {
         struct qblock *successors[2];
 
         int index;
+
+        /** @{ used by vc4_qir_live_variables.c */
+        BITSET_WORD *def;
+        BITSET_WORD *use;
+        BITSET_WORD *live_in;
+        BITSET_WORD *live_out;
+        int start_ip, end_ip;
+        /** @} */
 };
 
 struct vc4_compile {
@@ -422,6 +431,9 @@ struct vc4_compile {
         struct vc4_fs_key *fs_key;
         struct vc4_vs_key *vs_key;
 
+        /* Live ranges of temps. */
+        int *temp_start, *temp_end;
+
         uint32_t *uniform_data;
         enum quniform_contents *uniform_contents;
         uint32_t uniform_array_size;
@@ -488,6 +500,7 @@ struct qreg qir_emit_def(struct vc4_compile *c, struct qinst *inst);
 struct qinst *qir_emit_nondef(struct vc4_compile *c, struct qinst *inst);
 
 struct qreg qir_get_temp(struct vc4_compile *c);
+void qir_calculate_live_intervals(struct vc4_compile *c);
 int qir_get_op_nsrc(enum qop qop);
 bool qir_reg_equals(struct qreg a, struct qreg b);
 bool qir_has_side_effects(struct vc4_compile *c, struct qinst *inst);
@@ -499,6 +512,7 @@ bool qir_is_float_input(struct qinst *inst);
 bool qir_depends_on_flags(struct qinst *inst);
 bool qir_writes_r4(struct qinst *inst);
 struct qreg qir_follow_movs(struct vc4_compile *c, struct qreg reg);
+uint8_t qir_channels_written(struct qinst *inst);
 
 void qir_dump(struct vc4_compile *c);
 void qir_dump_inst(struct vc4_compile *c, struct qinst *inst);
@@ -667,7 +681,7 @@ qir_SEL(struct vc4_compile *c, uint8_t cond, struct qreg src0, struct qreg src1)
         struct qinst *a = qir_MOV_dest(c, t, src0);
         struct qinst *b = qir_MOV_dest(c, t, src1);
         a->cond = cond;
-        b->cond = cond ^ 1;
+        b->cond = qpu_cond_complement(cond);
         return t;
 }
 
diff --git a/src/gallium/drivers/vc4/vc4_qir_live_variables.c b/src/gallium/drivers/vc4/vc4_qir_live_variables.c
new file mode 100644 (file)
index 0000000..eac350a
--- /dev/null
@@ -0,0 +1,316 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ * Copyright © 2016 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#define MAX_INSTRUCTION (1 << 30)
+
+#include "util/ralloc.h"
+#include "util/register_allocate.h"
+#include "vc4_context.h"
+#include "vc4_qir.h"
+
+struct partial_update_state { /* Partial-write bookkeeping for one temp, keyed
+                               * by dst.index in the partial update hash
+                               * table. */
+        struct qinst *insts[4]; /* Per channel: the most recent conditional
+                                 * write that did not complete a pair of
+                                 * complementary conditions (NULL if none). */
+        uint8_t channels; /* Mask of channels counted as fully written; the
+                           * temp becomes a def once this reaches 0xf. */
+};
+
+/* Hash callback for the partial update table: hashes the sizeof(int) bytes
+ * that key points at.
+ */
+static uint32_t
+int_hash(const void *key)
+{
+        const uint32_t hash = _mesa_hash_data(key, sizeof(int));
+
+        return hash;
+}
+
+/* Key equality callback for the partial update table: both keys are read as
+ * pointers to int and the pointed-to values are compared.
+ */
+static bool
+int_compare(const void *key1, const void *key2)
+{
+        const int *lhs = key1;
+        const int *rhs = key2;
+
+        return *lhs == *rhs;
+}
+
+/* Maps a register to the variable number used by the liveness bitsets: the
+ * register's index for QFILE_TEMP, or -1 for every other file.
+ */
+static int
+qir_reg_to_var(struct qreg reg)
+{
+        if (reg.file != QFILE_TEMP)
+                return -1;
+
+        return reg.index;
+}
+
+/* Records a read of src by the instruction at ip inside block.
+ *
+ * Registers that qir_reg_to_var() does not map to a variable are ignored.
+ * For the rest, the variable's [temp_start, temp_end] range is widened to
+ * include ip and the block's use[] bit is updated.
+ */
+static void
+qir_setup_use(struct vc4_compile *c, struct qblock *block, int ip,
+              struct qreg src)
+{
+        const int temp = qir_reg_to_var(src);
+
+        if (temp != -1) {
+                if (ip < c->temp_start[temp])
+                        c->temp_start[temp] = ip;
+                if (ip > c->temp_end[temp])
+                        c->temp_end[temp] = ip;
+
+                /* use[] is only set for a read that happens before the
+                 * block has completely defined the variable itself.
+                 */
+                if (!BITSET_TEST(block->def, temp))
+                        BITSET_SET(block->use, temp);
+        }
+}
+
+/* Looks up the partial update record for inst's destination index, creating
+ * a zero-filled one (allocated with the hash table as its ralloc parent) the
+ * first time that index is seen.
+ *
+ * The table key is the address of inst->dst.index itself.
+ */
+static struct partial_update_state *
+get_partial_update_state(struct hash_table *partial_update_ht,
+                         struct qinst *inst)
+{
+        const void *key = &inst->dst.index;
+        struct hash_entry *found =
+                _mesa_hash_table_search(partial_update_ht, key);
+        struct partial_update_state *state;
+
+        if (found) {
+                state = found->data;
+        } else {
+                state = rzalloc(partial_update_ht,
+                                struct partial_update_state);
+                _mesa_hash_table_insert(partial_update_ht, key, state);
+        }
+
+        return state;
+}
+
+static void
+qir_setup_def(struct vc4_compile *c, struct qblock *block, int ip,
+              struct hash_table *partial_update_ht, struct qinst *inst)
+{
+        /* Records inst's write to its destination at ip: widens the
+         * destination's [temp_start, temp_end] range to include ip, and
+         * sets its bit in block->def once the block is known to define it
+         * completely.  Destinations that qir_reg_to_var() does not map to a
+         * variable are ignored.
+         *
+         * The def[] bitset marks when an initialization in a
+         * block completely screens off previous updates of
+         * that variable.
+         */
+        int var = qir_reg_to_var(inst->dst);
+        if (var == -1)
+                return;
+
+        c->temp_start[var] = MIN2(c->temp_start[var], ip);
+        c->temp_end[var] = MAX2(c->temp_end[var], ip);
+
+        /* If we've already tracked this as a def, or already used it within
+         * the block, there's nothing to do: a variable read before being
+         * fully written stays in use[] and is never added to def[].
+         */
+        if (BITSET_TEST(block->use, var) || BITSET_TEST(block->def, var))
+                return;
+
+        /* Easy, common case: unconditional full register update (no
+         * condition and no destination pack mode).
+         */
+        if (inst->cond == QPU_COND_ALWAYS && !inst->dst.pack) {
+                BITSET_SET(block->def, var);
+                return;
+        }
+
+        /* Finally, look at the condition code and packing and mark it as a
+         * def.  We need to make sure that we understand sequences of
+         * instructions like:
+         *
+         *     mov.zs t0, t1
+         *     mov.zc t0, t2
+         *
+         * or:
+         *
+         *     mmov t0.8a, t1
+         *     mmov t0.8b, t2
+         *     mmov t0.8c, t3
+         *     mmov t0.8d, t4
+         *
+         * as defining the temp within the block, because otherwise dst's live
+         * range will get extended up the control flow to the top of the
+         * program.
+         *
+         * This is tracked per temp in a partial_update_state.  An
+         * unconditional packed write adds its channels to the mask directly.
+         * A conditional write completes a channel only when the instruction
+         * remembered for that channel has the complementary condition;
+         * otherwise it replaces the remembered instruction.  Once all four
+         * channels are accounted for (0xf), the temp is marked as a def.
+         */
+        struct partial_update_state *state =
+                get_partial_update_state(partial_update_ht, inst);
+        uint8_t mask = qir_channels_written(inst);
+
+        if (inst->cond == QPU_COND_ALWAYS) {
+                state->channels |= mask;
+        } else {
+                for (int i = 0; i < 4; i++) {
+                        if (!(mask & (1 << i)))
+                                continue;
+
+                        if (state->insts[i] &&
+                            state->insts[i]->cond ==
+                            qpu_cond_complement(inst->cond))
+                                state->channels |= 1 << i;
+                        else
+                                state->insts[i] = inst;
+                }
+        }
+
+        if (state->channels == 0xf)
+                BITSET_SET(block->def, var);
+}
+
+/* Drops the remembered per-channel instruction of every record in the table
+ * when that instruction has a nonzero cond; the channels masks accumulated
+ * so far are left alone.
+ *
+ * qir_setup_def_use() calls this after an instruction with sf set --
+ * presumably because a flags update means earlier conditional writes can no
+ * longer be paired with later ones (confirm against the QPU flag semantics).
+ */
+static void
+sf_state_clear(struct hash_table *partial_update_ht)
+{
+        struct hash_entry *entry;
+
+        hash_table_foreach(partial_update_ht, entry) {
+                struct partial_update_state *state = entry->data;
+                int chan = 0;
+
+                while (chan < 4) {
+                        struct qinst *pending = state->insts[chan];
+
+                        if (pending && pending->cond)
+                                state->insts[chan] = NULL;
+                        chan++;
+                }
+        }
+}
+
+/* Sets up the def/use arrays for when variables are used-before-defined or
+ * defined-before-used in the block.
+ *
+ * Also initializes the temp_start/temp_end to cover just the instruction IPs
+ * where the variable is used, which will be extended later in
+ * qir_compute_start_end().
+ *
+ * ip is a single instruction counter that runs across all blocks in order.
+ * Each block's start_ip is the ip of its first instruction and its end_ip is
+ * the ip one past its last instruction.
+ *
+ * Within an instruction the sources are processed before the destination,
+ * so an instruction that reads and writes the same temp counts as a use
+ * before the def.  The partial update table is emptied at the start of each
+ * block, and sf_state_clear() is run on it after any instruction with sf
+ * set.
+ */
+static void
+qir_setup_def_use(struct vc4_compile *c)
+{
+        struct hash_table *partial_update_ht =
+                _mesa_hash_table_create(c, int_hash, int_compare);
+        int ip = 0;
+
+        qir_for_each_block(block, c) {
+                block->start_ip = ip;
+
+                _mesa_hash_table_clear(partial_update_ht, NULL);
+
+                qir_for_each_inst(inst, block) {
+                        for (int i = 0; i < qir_get_op_nsrc(inst->op); i++)
+                                qir_setup_use(c, block, ip, inst->src[i]);
+
+                        qir_setup_def(c, block, ip, partial_update_ht, inst);
+
+                        if (inst->sf)
+                                sf_state_clear(partial_update_ht);
+
+                        switch (inst->op) {
+                        case QOP_FRAG_Z:
+                        case QOP_FRAG_W:
+                                /* The payload registers have values
+                                 * implicitly loaded at the start of the
+                                 * program, so a temp written by one of
+                                 * these ops is treated as live from ip 0.
+                                 */
+                                if (inst->dst.file == QFILE_TEMP)
+                                        c->temp_start[inst->dst.index] = 0;
+                                break;
+                        default:
+                                break;
+                        }
+                        ip++;
+                }
+                block->end_ip = ip;
+        }
+
+        _mesa_hash_table_destroy(partial_update_ht, NULL);
+}
+
+/* Runs one pass of the liveness dataflow over the blocks in reverse order.
+ *
+ * bitset_words is the number of BITSET_WORDs in each per-block bitset.
+ * Returns true if any live_in or live_out bit was added, meaning the caller
+ * has to run another pass.
+ */
+static bool
+qir_live_variables_dataflow(struct vc4_compile *c, int bitset_words)
+{
+        bool progress = false;
+
+        qir_for_each_block_rev(block, c) {
+                /* live_out: whatever a successor needs live on entry has to
+                 * be live when this block exits.
+                 */
+                qir_for_each_successor(succ, block) {
+                        for (int w = 0; w < bitset_words; w++) {
+                                BITSET_WORD added = (succ->live_in[w] &
+                                                     ~block->live_out[w]);
+
+                                if (!added)
+                                        continue;
+
+                                block->live_out[w] |= added;
+                                progress = true;
+                        }
+                }
+
+                /* live_in: everything used before being defined here, plus
+                 * everything live on exit that this block does not define.
+                 */
+                for (int w = 0; w < bitset_words; w++) {
+                        BITSET_WORD wanted = (block->use[w] |
+                                              (block->live_out[w] &
+                                               ~block->def[w]));
+
+                        if (!(wanted & ~block->live_in[w]))
+                                continue;
+
+                        block->live_in[w] |= wanted;
+                        progress = true;
+                }
+        }
+
+        return progress;
+}
+
+/**
+ * Widens each variable's [temp_start, temp_end] range using the per-block
+ * liveness results: a variable live into a block must cover that block's
+ * start_ip, and a variable live out of a block must cover its end_ip.
+ */
+static void
+qir_compute_start_end(struct vc4_compile *c, int num_vars)
+{
+        qir_for_each_block(block, c) {
+                for (int var = 0; var < num_vars; var++) {
+                        int *start = &c->temp_start[var];
+                        int *end = &c->temp_end[var];
+
+                        if (BITSET_TEST(block->live_in, var)) {
+                                if (block->start_ip < *start)
+                                        *start = block->start_ip;
+                                if (block->start_ip > *end)
+                                        *end = block->start_ip;
+                        }
+
+                        if (BITSET_TEST(block->live_out, var)) {
+                                if (block->end_ip < *start)
+                                        *start = block->end_ip;
+                                if (block->end_ip > *end)
+                                        *end = block->end_ip;
+                        }
+                }
+        }
+}
+
+/**
+ * Computes c->temp_start[] and c->temp_end[], the first and last instruction
+ * IP at which each temp is live, taking control flow between blocks into
+ * account.
+ *
+ * The per-temp arrays and the per-block def/use/live_in/live_out bitsets are
+ * (re)allocated with c as their ralloc parent and sized for the current
+ * c->num_temps, so the function may be called again after temps are added.
+ */
+void
+qir_calculate_live_intervals(struct vc4_compile *c)
+{
+        int bitset_words = BITSET_WORDS(c->num_temps);
+
+        c->temp_start = reralloc(c, c->temp_start, int, c->num_temps);
+        c->temp_end = reralloc(c, c->temp_end, int, c->num_temps);
+
+        /* Start every range out empty (start past any real IP, end before
+         * the first one) so the MIN2/MAX2 updates below can only widen it.
+         */
+        for (int i = 0; i < c->num_temps; i++) {
+                c->temp_start[i] = MAX_INSTRUCTION;
+                c->temp_end[i] = -1;
+        }
+
+        qir_for_each_block(block, c) {
+                /* The bitsets are tested and OR-ed into as if they start
+                 * out empty, but reralloc() does not clear the memory it
+                 * returns and would also keep bits from an earlier call.
+                 * Drop any previous storage and allocate zeroed bitsets.
+                 */
+                ralloc_free(block->def);
+                ralloc_free(block->use);
+                ralloc_free(block->live_in);
+                ralloc_free(block->live_out);
+
+                block->def = rzalloc_array(c, BITSET_WORD, bitset_words);
+                block->use = rzalloc_array(c, BITSET_WORD, bitset_words);
+                block->live_in = rzalloc_array(c, BITSET_WORD, bitset_words);
+                block->live_out = rzalloc_array(c, BITSET_WORD, bitset_words);
+        }
+
+        qir_setup_def_use(c);
+
+        /* Iterate the dataflow until no block's live sets change. */
+        while (qir_live_variables_dataflow(c, bitset_words))
+                ;
+
+        qir_compute_start_end(c, c->num_temps);
+}
index b7dab16..83fa36e 100644 (file)
@@ -153,6 +153,12 @@ bool qpu_inst_is_tlb(uint64_t inst) ATTRIBUTE_CONST;
 int qpu_num_sf_accesses(uint64_t inst) ATTRIBUTE_CONST;
 void qpu_serialize_one_inst(struct vc4_compile *c, uint64_t inst);
 
+/**
+ * Returns the condition code obtained by toggling the low bit of cond.
+ *
+ * NOTE(review): this relies on each condition and its complement (e.g. zs
+ * and zc) differing only in bit 0 of the enum value -- confirm against the
+ * enum qpu_cond definition.
+ */
+static inline enum qpu_cond
+qpu_cond_complement(enum qpu_cond cond)
+{
+        return (enum qpu_cond)(cond ^ 1);
+}
+
 static inline uint64_t
 qpu_load_imm_f(struct qpu_reg dst, float val)
 {
index bb5a396..203b459 100644 (file)
@@ -175,14 +175,9 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
 {
         struct node_to_temp_map map[c->num_temps];
         uint32_t temp_to_node[c->num_temps];
-        uint32_t def[c->num_temps];
-        uint32_t use[c->num_temps];
         uint8_t class_bits[c->num_temps];
         struct qpu_reg *temp_registers = calloc(c->num_temps,
                                                 sizeof(*temp_registers));
-        for (int i = 0; i < ARRAY_SIZE(def); i++)
-                def[i] = ~0;
-        memset(use, 0, sizeof(use));
 
         /* If things aren't ever written (undefined values), just read from
          * r0.
@@ -195,38 +190,12 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
         struct ra_graph *g = ra_alloc_interference_graph(vc4->regs,
                                                          c->num_temps);
 
-        /* Compute the live ranges so we can figure out interference.
-         */
-        uint32_t ip = 0;
-        qir_for_each_inst_inorder(inst, c) {
-                if (inst->dst.file == QFILE_TEMP) {
-                        def[inst->dst.index] = MIN2(ip, def[inst->dst.index]);
-                        use[inst->dst.index] = ip;
-                }
-
-                for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
-                        if (inst->src[i].file == QFILE_TEMP)
-                                use[inst->src[i].index] = ip;
-                }
-
-                switch (inst->op) {
-                case QOP_FRAG_Z:
-                case QOP_FRAG_W:
-                        /* The payload registers have values implicitly loaded
-                         * at the start of the program.
-                         */
-                        def[inst->dst.index] = 0;
-                        break;
-                default:
-                        break;
-                }
-
-                ip++;
-        }
+        /* Compute the live ranges so we can figure out interference. */
+        qir_calculate_live_intervals(c);
 
         for (uint32_t i = 0; i < c->num_temps; i++) {
                 map[i].temp = i;
-                map[i].priority = use[i] - def[i];
+                map[i].priority = c->temp_end[i] - c->temp_start[i];
         }
         qsort(map, c->num_temps, sizeof(map[0]), node_to_temp_priority);
         for (uint32_t i = 0; i < c->num_temps; i++) {
@@ -241,7 +210,7 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
                CLASS_BIT_A | CLASS_BIT_B_OR_ACC | CLASS_BIT_R4,
                sizeof(class_bits));
 
-        ip = 0;
+        int ip = 0;
         qir_for_each_inst_inorder(inst, c) {
                 if (qir_writes_r4(inst)) {
                         /* This instruction writes r4 (and optionally moves
@@ -249,7 +218,7 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
                          * stored in r4 across it.
                          */
                         for (int i = 0; i < c->num_temps; i++) {
-                                if (def[i] < ip && use[i] > ip)
+                                if (c->temp_start[i] < ip && c->temp_end[i] > ip)
                                         class_bits[i] &= ~CLASS_BIT_R4;
                         }
                 } else {
@@ -328,7 +297,8 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
 
         for (uint32_t i = 0; i < c->num_temps; i++) {
                 for (uint32_t j = i + 1; j < c->num_temps; j++) {
-                        if (!(def[i] >= use[j] || def[j] >= use[i])) {
+                        if (!(c->temp_start[i] >= c->temp_end[j] ||
+                              c->temp_start[j] >= c->temp_end[i])) {
                                 ra_add_node_interference(g,
                                                          temp_to_node[i],
                                                          temp_to_node[j]);
@@ -349,7 +319,7 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
                 /* If the value's never used, just write to the NOP register
                  * for clarity in debug output.
                  */
-                if (def[i] == use[i])
+                if (c->temp_start[i] == c->temp_end[i])
                         temp_registers[i] = qpu_ra(QPU_W_NOP);
         }