From 89918c1e74e454af119e7ae23f3ed66fc26abc4b Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Thu, 10 Mar 2016 12:53:57 -0800 Subject: [PATCH] vc4: Implement live intervals using a CFG. Right now our CFG is always a trivial single basic block, but that will change when we enable loops. --- src/gallium/drivers/vc4/Makefile.sources | 1 + src/gallium/drivers/vc4/vc4_qir.c | 47 ++++ src/gallium/drivers/vc4/vc4_qir.h | 16 +- src/gallium/drivers/vc4/vc4_qir_live_variables.c | 316 +++++++++++++++++++++++ src/gallium/drivers/vc4/vc4_qpu.h | 6 + src/gallium/drivers/vc4/vc4_register_allocate.c | 46 +--- 6 files changed, 393 insertions(+), 39 deletions(-) create mode 100644 src/gallium/drivers/vc4/vc4_qir_live_variables.c diff --git a/src/gallium/drivers/vc4/Makefile.sources b/src/gallium/drivers/vc4/Makefile.sources index 612a0a4..76e46f5 100644 --- a/src/gallium/drivers/vc4/Makefile.sources +++ b/src/gallium/drivers/vc4/Makefile.sources @@ -31,6 +31,7 @@ C_SOURCES := \ vc4_opt_vpm.c \ vc4_program.c \ vc4_qir.c \ + vc4_qir_live_variables.c \ vc4_qir_lower_uniforms.c \ vc4_qir_schedule.c \ vc4_qir_validate.c \ diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c index c23b332..3f59fce 100644 --- a/src/gallium/drivers/vc4/vc4_qir.c +++ b/src/gallium/drivers/vc4/vc4_qir.c @@ -224,6 +224,53 @@ qir_writes_r4(struct qinst *inst) } } +uint8_t +qir_channels_written(struct qinst *inst) +{ + if (qir_is_mul(inst)) { + switch (inst->dst.pack) { + case QPU_PACK_MUL_NOP: + case QPU_PACK_MUL_8888: + return 0xf; + case QPU_PACK_MUL_8A: + return 0x1; + case QPU_PACK_MUL_8B: + return 0x2; + case QPU_PACK_MUL_8C: + return 0x4; + case QPU_PACK_MUL_8D: + return 0x8; + } + } else { + switch (inst->dst.pack) { + case QPU_PACK_A_NOP: + case QPU_PACK_A_8888: + case QPU_PACK_A_8888_SAT: + case QPU_PACK_A_32_SAT: + return 0xf; + case QPU_PACK_A_8A: + case QPU_PACK_A_8A_SAT: + return 0x1; + case QPU_PACK_A_8B: + case QPU_PACK_A_8B_SAT: + return 0x2; + case QPU_PACK_A_8C: + 
case QPU_PACK_A_8C_SAT: + return 0x4; + case QPU_PACK_A_8D: + case QPU_PACK_A_8D_SAT: + return 0x8; + case QPU_PACK_A_16A: + case QPU_PACK_A_16A_SAT: + return 0x3; + case QPU_PACK_A_16B: + case QPU_PACK_A_16B_SAT: + return 0xc; + } + } + unreachable("Bad pack field"); +} + static void qir_print_reg(struct vc4_compile *c, struct qreg reg, bool write) { diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h index f3e6575..5099b7f 100644 --- a/src/gallium/drivers/vc4/vc4_qir.h +++ b/src/gallium/drivers/vc4/vc4_qir.h @@ -38,6 +38,7 @@ #include "vc4_screen.h" #include "vc4_qpu_defines.h" +#include "vc4_qpu.h" #include "kernel/vc4_packet.h" #include "pipe/p_state.h" @@ -353,6 +354,14 @@ struct qblock { struct qblock *successors[2]; int index; + + /** @{ used by vc4_qir_live_variables.c */ + BITSET_WORD *def; + BITSET_WORD *use; + BITSET_WORD *live_in; + BITSET_WORD *live_out; + int start_ip, end_ip; + /** @} */ }; struct vc4_compile { @@ -422,6 +431,9 @@ struct vc4_compile { struct vc4_fs_key *fs_key; struct vc4_vs_key *vs_key; + /* Live ranges of temps. 
*/ + int *temp_start, *temp_end; + uint32_t *uniform_data; enum quniform_contents *uniform_contents; uint32_t uniform_array_size; @@ -488,6 +500,7 @@ struct qreg qir_emit_def(struct vc4_compile *c, struct qinst *inst); struct qinst *qir_emit_nondef(struct vc4_compile *c, struct qinst *inst); struct qreg qir_get_temp(struct vc4_compile *c); +void qir_calculate_live_intervals(struct vc4_compile *c); int qir_get_op_nsrc(enum qop qop); bool qir_reg_equals(struct qreg a, struct qreg b); bool qir_has_side_effects(struct vc4_compile *c, struct qinst *inst); @@ -499,6 +512,7 @@ bool qir_is_float_input(struct qinst *inst); bool qir_depends_on_flags(struct qinst *inst); bool qir_writes_r4(struct qinst *inst); struct qreg qir_follow_movs(struct vc4_compile *c, struct qreg reg); +uint8_t qir_channels_written(struct qinst *inst); void qir_dump(struct vc4_compile *c); void qir_dump_inst(struct vc4_compile *c, struct qinst *inst); @@ -667,7 +681,7 @@ qir_SEL(struct vc4_compile *c, uint8_t cond, struct qreg src0, struct qreg src1) struct qinst *a = qir_MOV_dest(c, t, src0); struct qinst *b = qir_MOV_dest(c, t, src1); a->cond = cond; - b->cond = cond ^ 1; + b->cond = qpu_cond_complement(cond); return t; } diff --git a/src/gallium/drivers/vc4/vc4_qir_live_variables.c b/src/gallium/drivers/vc4/vc4_qir_live_variables.c new file mode 100644 index 0000000..eac350a --- /dev/null +++ b/src/gallium/drivers/vc4/vc4_qir_live_variables.c @@ -0,0 +1,316 @@ +/* + * Copyright © 2012 Intel Corporation + * Copyright © 2016 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The 
above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#define MAX_INSTRUCTION (1 << 30) + +#include "util/ralloc.h" +#include "util/register_allocate.h" +#include "vc4_context.h" +#include "vc4_qir.h" + +struct partial_update_state { + struct qinst *insts[4]; + uint8_t channels; +}; + +static uint32_t +int_hash(const void *key) +{ + return _mesa_hash_data(key, sizeof(int)); +} + +static bool +int_compare(const void *key1, const void *key2) +{ + return *(const int *)key1 == *(const int *)key2; +} + +static int +qir_reg_to_var(struct qreg reg) +{ + if (reg.file == QFILE_TEMP) + return reg.index; + + return -1; +} + +static void +qir_setup_use(struct vc4_compile *c, struct qblock *block, int ip, + struct qreg src) +{ + int var = qir_reg_to_var(src); + if (var == -1) + return; + + c->temp_start[var] = MIN2(c->temp_start[var], ip); + c->temp_end[var] = MAX2(c->temp_end[var], ip); + + /* The use[] bitset marks when the block makes + * use of a variable without having completely + * defined that variable within the block. 
+ */ + if (!BITSET_TEST(block->def, var)) + BITSET_SET(block->use, var); +} + +static struct partial_update_state * +get_partial_update_state(struct hash_table *partial_update_ht, + struct qinst *inst) +{ + struct hash_entry *entry = + _mesa_hash_table_search(partial_update_ht, + &inst->dst.index); + if (entry) + return entry->data; + + struct partial_update_state *state = + rzalloc(partial_update_ht, struct partial_update_state); + + _mesa_hash_table_insert(partial_update_ht, &inst->dst.index, state); + + return state; +} + +static void +qir_setup_def(struct vc4_compile *c, struct qblock *block, int ip, + struct hash_table *partial_update_ht, struct qinst *inst) +{ + /* The def[] bitset marks when an initialization in a + * block completely screens off previous updates of + * that variable. + */ + int var = qir_reg_to_var(inst->dst); + if (var == -1) + return; + + c->temp_start[var] = MIN2(c->temp_start[var], ip); + c->temp_end[var] = MAX2(c->temp_end[var], ip); + + /* If we've already tracked this as a def, or already used it within + * the block, there's nothing to do. + */ + if (BITSET_TEST(block->use, var) || BITSET_TEST(block->def, var)) + return; + + /* Easy, common case: unconditional full register update. */ + if (inst->cond == QPU_COND_ALWAYS && !inst->dst.pack) { + BITSET_SET(block->def, var); + return; + } + + /* Finally, look at the condition code and packing and mark it as a + * def. We need to make sure that we understand sequences of + * instructions like: + * + * mov.zs t0, t1 + * mov.zc t0, t2 + * + * or: + * + * mmov t0.8a, t1 + * mmov t0.8b, t2 + * mmov t0.8c, t3 + * mmov t0.8d, t4 + * + * as defining the temp within the block, because otherwise dst's live + * range will get extended up the control flow to the top of the + * program. 
+ */ + struct partial_update_state *state = + get_partial_update_state(partial_update_ht, inst); + uint8_t mask = qir_channels_written(inst); + + if (inst->cond == QPU_COND_ALWAYS) { + state->channels |= mask; + } else { + for (int i = 0; i < 4; i++) { + if (!(mask & (1 << i))) + continue; + + if (state->insts[i] && + state->insts[i]->cond == + qpu_cond_complement(inst->cond)) + state->channels |= 1 << i; + else + state->insts[i] = inst; + } + } + + if (state->channels == 0xf) + BITSET_SET(block->def, var); +} + +static void +sf_state_clear(struct hash_table *partial_update_ht) +{ + struct hash_entry *entry; + + hash_table_foreach(partial_update_ht, entry) { + struct partial_update_state *state = entry->data; + + for (int i = 0; i < 4; i++) { + if (state->insts[i] && state->insts[i]->cond) + state->insts[i] = NULL; + } + } +} + +/* Sets up the def/use arrays for when variables are used-before-defined or + * defined-before-used in the block. + * + * Also initializes the temp_start/temp_end to cover just the instruction IPs + * where the variable is used, which will be extended later in + * qir_compute_start_end(). + */ +static void +qir_setup_def_use(struct vc4_compile *c) +{ + struct hash_table *partial_update_ht = + _mesa_hash_table_create(c, int_hash, int_compare); + int ip = 0; + + qir_for_each_block(block, c) { + block->start_ip = ip; + + _mesa_hash_table_clear(partial_update_ht, NULL); + + qir_for_each_inst(inst, block) { + for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) + qir_setup_use(c, block, ip, inst->src[i]); + + qir_setup_def(c, block, ip, partial_update_ht, inst); + + if (inst->sf) + sf_state_clear(partial_update_ht); + + switch (inst->op) { + case QOP_FRAG_Z: + case QOP_FRAG_W: + /* The payload registers have values + * implicitly loaded at the start of the + * program. 
+ */ + if (inst->dst.file == QFILE_TEMP) + c->temp_start[inst->dst.index] = 0; + break; + default: + break; + } + ip++; + } + block->end_ip = ip; + } + + _mesa_hash_table_destroy(partial_update_ht, NULL); +} + +static bool +qir_live_variables_dataflow(struct vc4_compile *c, int bitset_words) +{ + bool cont = false; + + qir_for_each_block_rev(block, c) { + /* Update live_out: Any successor using the variable + * on entrance needs us to have the variable live on + * exit. + */ + qir_for_each_successor(succ, block) { + for (int i = 0; i < bitset_words; i++) { + BITSET_WORD new_live_out = (succ->live_in[i] & + ~block->live_out[i]); + if (new_live_out) { + block->live_out[i] |= new_live_out; + cont = true; + } + } + } + + /* Update live_in */ + for (int i = 0; i < bitset_words; i++) { + BITSET_WORD new_live_in = (block->use[i] | + (block->live_out[i] & + ~block->def[i])); + if (new_live_in & ~block->live_in[i]) { + block->live_in[i] |= new_live_in; + cont = true; + } + } + } + + return cont; +} + +/** + * Extend the start/end ranges for each variable to account for the + * new information calculated from control flow. 
+ */ +static void +qir_compute_start_end(struct vc4_compile *c, int num_vars) +{ + qir_for_each_block(block, c) { + for (int i = 0; i < num_vars; i++) { + if (BITSET_TEST(block->live_in, i)) { + c->temp_start[i] = MIN2(c->temp_start[i], + block->start_ip); + c->temp_end[i] = MAX2(c->temp_end[i], + block->start_ip); + } + + if (BITSET_TEST(block->live_out, i)) { + c->temp_start[i] = MIN2(c->temp_start[i], + block->end_ip); + c->temp_end[i] = MAX2(c->temp_end[i], + block->end_ip); + } + } + } +} + +void +qir_calculate_live_intervals(struct vc4_compile *c) +{ + int bitset_words = BITSET_WORDS(c->num_temps); + + c->temp_start = reralloc(c, c->temp_start, int, c->num_temps); + c->temp_end = reralloc(c, c->temp_end, int, c->num_temps); + + for (int i = 0; i < c->num_temps; i++) { + c->temp_start[i] = MAX_INSTRUCTION; + c->temp_end[i] = -1; + } + + qir_for_each_block(block, c) { /* def/use/live_* are tested and only ever OR-ed into below, so they must start out zeroed. */ + block->def = rzalloc_array(c, BITSET_WORD, bitset_words); + block->use = rzalloc_array(c, BITSET_WORD, bitset_words); + block->live_in = rzalloc_array(c, BITSET_WORD, bitset_words); + block->live_out = rzalloc_array(c, BITSET_WORD, bitset_words); + } + + qir_setup_def_use(c); + + while (qir_live_variables_dataflow(c, bitset_words)) + ; + + qir_compute_start_end(c, c->num_temps); +} diff --git a/src/gallium/drivers/vc4/vc4_qpu.h b/src/gallium/drivers/vc4/vc4_qpu.h index b7dab16..83fa36e 100644 --- a/src/gallium/drivers/vc4/vc4_qpu.h +++ b/src/gallium/drivers/vc4/vc4_qpu.h @@ -153,6 +153,12 @@ bool qpu_inst_is_tlb(uint64_t inst) ATTRIBUTE_CONST; int qpu_num_sf_accesses(uint64_t inst) ATTRIBUTE_CONST; void qpu_serialize_one_inst(struct vc4_compile *c, uint64_t inst); +static inline enum qpu_cond +qpu_cond_complement(enum qpu_cond cond) +{ + return cond ^ 1; +} + static inline uint64_t qpu_load_imm_f(struct qpu_reg dst, float val) { diff --git a/src/gallium/drivers/vc4/vc4_register_allocate.c b/src/gallium/drivers/vc4/vc4_register_allocate.c index 
bb5a396..203b459 100644 --- a/src/gallium/drivers/vc4/vc4_register_allocate.c +++ b/src/gallium/drivers/vc4/vc4_register_allocate.c @@ -175,14 +175,9 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) { struct node_to_temp_map map[c->num_temps]; uint32_t temp_to_node[c->num_temps]; - uint32_t def[c->num_temps]; - uint32_t use[c->num_temps]; uint8_t class_bits[c->num_temps]; struct qpu_reg *temp_registers = calloc(c->num_temps, sizeof(*temp_registers)); - for (int i = 0; i < ARRAY_SIZE(def); i++) - def[i] = ~0; - memset(use, 0, sizeof(use)); /* If things aren't ever written (undefined values), just read from * r0. @@ -195,38 +190,12 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) struct ra_graph *g = ra_alloc_interference_graph(vc4->regs, c->num_temps); - /* Compute the live ranges so we can figure out interference. - */ - uint32_t ip = 0; - qir_for_each_inst_inorder(inst, c) { - if (inst->dst.file == QFILE_TEMP) { - def[inst->dst.index] = MIN2(ip, def[inst->dst.index]); - use[inst->dst.index] = ip; - } - - for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { - if (inst->src[i].file == QFILE_TEMP) - use[inst->src[i].index] = ip; - } - - switch (inst->op) { - case QOP_FRAG_Z: - case QOP_FRAG_W: - /* The payload registers have values implicitly loaded - * at the start of the program. - */ - def[inst->dst.index] = 0; - break; - default: - break; - } - - ip++; - } + /* Compute the live ranges so we can figure out interference. 
*/ + qir_calculate_live_intervals(c); for (uint32_t i = 0; i < c->num_temps; i++) { map[i].temp = i; - map[i].priority = use[i] - def[i]; + map[i].priority = c->temp_end[i] - c->temp_start[i]; } qsort(map, c->num_temps, sizeof(map[0]), node_to_temp_priority); for (uint32_t i = 0; i < c->num_temps; i++) { @@ -241,7 +210,7 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) CLASS_BIT_A | CLASS_BIT_B_OR_ACC | CLASS_BIT_R4, sizeof(class_bits)); - ip = 0; + int ip = 0; qir_for_each_inst_inorder(inst, c) { if (qir_writes_r4(inst)) { /* This instruction writes r4 (and optionally moves @@ -249,7 +218,7 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) * stored in r4 across it. */ for (int i = 0; i < c->num_temps; i++) { - if (def[i] < ip && use[i] > ip) + if (c->temp_start[i] < ip && c->temp_end[i] > ip) class_bits[i] &= ~CLASS_BIT_R4; } } else { @@ -328,7 +297,8 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) for (uint32_t i = 0; i < c->num_temps; i++) { for (uint32_t j = i + 1; j < c->num_temps; j++) { - if (!(def[i] >= use[j] || def[j] >= use[i])) { + if (!(c->temp_start[i] >= c->temp_end[j] || + c->temp_start[j] >= c->temp_end[i])) { ra_add_node_interference(g, temp_to_node[i], temp_to_node[j]); @@ -349,7 +319,7 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) /* If the value's never used, just write to the NOP register * for clarity in debug output. */ - if (def[i] == use[i]) + if (c->temp_start[i] == c->temp_end[i]) temp_registers[i] = qpu_ra(QPU_W_NOP); } -- 2.7.4