From 44df061aaad96fc5db630ae69fb2fe2a03bb5659 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Wed, 27 Apr 2016 12:14:07 -0700 Subject: [PATCH] vc4: Add support for scheduling of branch instructions. For now we don't fill the delay slots, and instead just drop in NOPs. --- src/gallium/drivers/vc4/vc4_qir.h | 11 +++ src/gallium/drivers/vc4/vc4_qpu_schedule.c | 120 +++++++++++++++++++++++++---- 2 files changed, 114 insertions(+), 17 deletions(-) diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h index fabdf10..e7ddfaa 100644 --- a/src/gallium/drivers/vc4/vc4_qir.h +++ b/src/gallium/drivers/vc4/vc4_qir.h @@ -362,6 +362,17 @@ struct qblock { int index; + /* Instruction IPs for the first and last instruction of the block. + * Set by vc4_qpu_schedule.c. + */ + uint32_t start_qpu_ip; + uint32_t end_qpu_ip; + + /* Instruction IP for the branch instruction of the block. Set by + * vc4_qpu_schedule.c. + */ + uint32_t branch_qpu_ip; + /** @{ used by vc4_qir_live_variables.c */ BITSET_WORD *def; BITSET_WORD *use; diff --git a/src/gallium/drivers/vc4/vc4_qpu_schedule.c b/src/gallium/drivers/vc4/vc4_qpu_schedule.c index fad10e5..a55b035 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_schedule.c +++ b/src/gallium/drivers/vc4/vc4_qpu_schedule.c @@ -354,7 +354,8 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) if (sig != QPU_SIG_LOAD_IMM) { process_raddr_deps(state, n, raddr_a, true); - if (sig != QPU_SIG_SMALL_IMM) + if (sig != QPU_SIG_SMALL_IMM && + sig != QPU_SIG_BRANCH) process_raddr_deps(state, n, raddr_b, false); } @@ -392,20 +393,23 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) add_read_dep(state, state->last_tlb, n); break; + case QPU_SIG_BRANCH: + add_read_dep(state, state->last_sf, n); + break; + case QPU_SIG_PROG_END: case QPU_SIG_WAIT_FOR_SCOREBOARD: case QPU_SIG_SCOREBOARD_UNLOCK: case QPU_SIG_COVERAGE_LOAD: case QPU_SIG_COLOR_LOAD_END: case QPU_SIG_ALPHA_MASK_LOAD: - case QPU_SIG_BRANCH: fprintf(stderr, "Unhandled signal bits %d\n", sig); abort(); } process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_ADD)); process_cond_deps(state, n, QPU_GET_FIELD(inst, QPU_COND_MUL)); - if (inst & QPU_SF) + if ((inst & QPU_SF) && sig != QPU_SIG_BRANCH) add_write_dep(state, &state->last_sf, n); } @@ -525,6 +529,16 @@ choose_instruction_to_schedule(struct choose_scoreboard *scoreboard, list_for_each_entry(struct schedule_node, n, schedule_list, link) { uint64_t inst = n->inst->inst; + /* Don't choose the branch instruction until it's the last one + * left. XXX: We could potentially choose it before it's the + * last one, if the remaining instructions fit in the delay + * slots. + */ + if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH && + !list_is_singular(schedule_list)) { + continue; + } + /* "An instruction must not read from a location in physical * regfile A or B that was written to by the previous * instruction." @@ -722,19 +736,16 @@ mark_instruction_scheduled(struct list_head *schedule_list, } static uint32_t -schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list, +schedule_instructions(struct vc4_compile *c, + struct choose_scoreboard *scoreboard, + struct qblock *block, + struct list_head *schedule_list, enum quniform_contents *orig_uniform_contents, uint32_t *orig_uniform_data, uint32_t *next_uniform) { - struct choose_scoreboard scoreboard; uint32_t time = 0; - memset(&scoreboard, 0, sizeof(scoreboard)); - scoreboard.last_waddr_a = ~0; - scoreboard.last_waddr_b = ~0; - scoreboard.last_sfu_write_tick = -10; - if (debug) { fprintf(stderr, "initial deps:\n"); dump_state(schedule_list); @@ -749,7 +760,7 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list, while (!list_empty(schedule_list)) { struct schedule_node *chosen = - choose_instruction_to_schedule(&scoreboard, + choose_instruction_to_schedule(scoreboard, schedule_list, NULL); struct schedule_node *merge = NULL; @@ -784,7 +795,7 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list, (*next_uniform)++; } - merge = choose_instruction_to_schedule(&scoreboard, + merge = choose_instruction_to_schedule(scoreboard, schedule_list, chosen); if (merge) { @@ -818,7 +829,7 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list, qpu_serialize_one_inst(c, inst); - update_scoreboard_for_chosen(&scoreboard, inst); + update_scoreboard_for_chosen(scoreboard, inst); /* Now that we've scheduled a new instruction, some of its * children can be promoted to the list of instructions ready to @@ -828,15 +839,34 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list, mark_instruction_scheduled(schedule_list, time, chosen, false); mark_instruction_scheduled(schedule_list, time, merge, false); - scoreboard.tick++; + scoreboard->tick++; time++; + + if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH) { + block->branch_qpu_ip = c->qpu_inst_count - 1; + /* Fill the delay slots. + * + * We should fill these with actual instructions, + * instead, but that will probably need to be done + * after this, once we know what the leading + * instructions of the successors are (so we can + * handle A/B register file write latency) + */ + inst = qpu_NOP(); + update_scoreboard_for_chosen(scoreboard, inst); + qpu_serialize_one_inst(c, inst); + qpu_serialize_one_inst(c, inst); + qpu_serialize_one_inst(c, inst); + } } return time; } static uint32_t -qpu_schedule_instructions_block(struct vc4_compile *c, struct qblock *block, +qpu_schedule_instructions_block(struct vc4_compile *c, + struct choose_scoreboard *scoreboard, + struct qblock *block, enum quniform_contents *orig_uniform_contents, uint32_t *orig_uniform_data, uint32_t *next_uniform) @@ -871,7 +901,8 @@ qpu_schedule_instructions_block(struct vc4_compile *c, struct qblock *block, compute_delay(n); } - uint32_t cycles = schedule_instructions(c, &schedule_list, + uint32_t cycles = schedule_instructions(c, scoreboard, block, + &schedule_list, orig_uniform_contents, orig_uniform_data, next_uniform); @@ -881,6 +912,46 @@ qpu_schedule_instructions_block(struct vc4_compile *c, struct qblock *block, return cycles; } +static void +qpu_set_branch_targets(struct vc4_compile *c) +{ + qir_for_each_block(block, c) { + /* The end block of the program has no branch. */ + if (!block->successors[0]) + continue; + + /* If there was no branch instruction, then the successor + * block must follow immediately after this one. + */ + if (block->branch_qpu_ip == ~0) { + assert(block->end_qpu_ip + 1 == + block->successors[0]->start_qpu_ip); + continue; + } + + /* Set the branch target for the block that doesn't follow + * immediately after ours. + */ + uint64_t *branch_inst = &c->qpu_insts[block->branch_qpu_ip]; + assert(QPU_GET_FIELD(*branch_inst, QPU_SIG) == QPU_SIG_BRANCH); + assert(QPU_GET_FIELD(*branch_inst, QPU_BRANCH_TARGET) == 0); + + uint32_t branch_target = + (block->successors[0]->start_qpu_ip - + (block->branch_qpu_ip + 4)) * sizeof(uint64_t); + *branch_inst = (*branch_inst | + QPU_SET_FIELD(branch_target, QPU_BRANCH_TARGET)); + + /* Make sure that the if-we-don't-jump successor was scheduled + * just after the delay slots. + */ + if (block->successors[1]) { + assert(block->successors[1]->start_qpu_ip == + block->branch_qpu_ip + 4); + } + } +} + uint32_t qpu_schedule_instructions(struct vc4_compile *c) { @@ -895,6 +966,12 @@ qpu_schedule_instructions(struct vc4_compile *c) c->uniform_array_size = c->num_uniforms; uint32_t next_uniform = 0; + struct choose_scoreboard scoreboard; + memset(&scoreboard, 0, sizeof(scoreboard)); + scoreboard.last_waddr_a = ~0; + scoreboard.last_waddr_b = ~0; + scoreboard.last_sfu_write_tick = -10; + if (debug) { fprintf(stderr, "Pre-schedule instructions\n"); qir_for_each_block(block, c) { @@ -910,12 +987,21 @@ qpu_schedule_instructions(struct vc4_compile *c) uint32_t cycles = 0; qir_for_each_block(block, c) { - cycles += qpu_schedule_instructions_block(c, block, + block->start_qpu_ip = c->qpu_inst_count; + block->branch_qpu_ip = ~0; + + cycles += qpu_schedule_instructions_block(c, + &scoreboard, + block, uniform_contents, uniform_data, &next_uniform); + + block->end_qpu_ip = c->qpu_inst_count - 1; } + qpu_set_branch_targets(c); + assert(next_uniform == c->num_uniforms); if (debug) { -- 2.7.4