From 8c47640731303ed2607d28ce2cf19a7e8f0f4006 Mon Sep 17 00:00:00 2001 From: Erico Nunes Date: Tue, 14 Apr 2020 02:45:38 +0200 Subject: [PATCH] lima/ppir: rework store output In many cases, it is possible to avoid creating a mov for the store output node. Additionally, nodes other than alu, such as load varying, can be valid store output nodes too. This is another small optimization, but helps a vast majority of programs by 1 instruction. Shaders with discard easily become complicated to handle properly. Some example issues: ppir has to rely on instruction ordering; or a node with ssa output could be required both before a discard_if (as a condition) and after it (as the instruction with the 'stop' bit set). So don't try to handle them here. Signed-off-by: Erico Nunes Reviewed-by: Vasily Khoruzhick Part-of: --- src/gallium/drivers/lima/ir/pp/codegen.c | 2 -- src/gallium/drivers/lima/ir/pp/liveness.c | 1 - src/gallium/drivers/lima/ir/pp/nir.c | 34 +++++++++++++++++++++----- src/gallium/drivers/lima/ir/pp/node.c | 13 ++++------ src/gallium/drivers/lima/ir/pp/node_to_instr.c | 13 +++------- src/gallium/drivers/lima/ir/pp/ppir.h | 3 ++- src/gallium/drivers/lima/ir/pp/regalloc.c | 2 +- 7 files changed, 40 insertions(+), 28 deletions(-) diff --git a/src/gallium/drivers/lima/ir/pp/codegen.c b/src/gallium/drivers/lima/ir/pp/codegen.c index ce2c107..4420ee3 100644 --- a/src/gallium/drivers/lima/ir/pp/codegen.c +++ b/src/gallium/drivers/lima/ir/pp/codegen.c @@ -201,7 +201,6 @@ static void ppir_codegen_encode_vec_mul(ppir_node *node, void *code) f->op = shift_to_op(alu->shift); break; case ppir_op_mov: - case ppir_op_store_color: f->op = ppir_codegen_vec4_mul_op_mov; break; case ppir_op_max: @@ -344,7 +343,6 @@ static void ppir_codegen_encode_vec_add(ppir_node *node, void *code) f->op = ppir_codegen_vec4_acc_op_add; break; case ppir_op_mov: - case ppir_op_store_color: f->op = ppir_codegen_vec4_acc_op_mov; break; case ppir_op_sum3: diff --git a/src/gallium/drivers/lima/ir/pp/liveness.c b/src/gallium/drivers/lima/ir/pp/liveness.c index b17564b..20954bd 100644 --- a/src/gallium/drivers/lima/ir/pp/liveness.c +++ b/src/gallium/drivers/lima/ir/pp/liveness.c @@ -178,7 +178,6 @@ ppir_liveness_instr_dest(ppir_compiler *comp, ppir_instr *instr) switch(node->op) { case ppir_op_const: case ppir_op_undef: - case ppir_op_store_color: /* never clear dest if its store output */ continue; default: break; diff --git a/src/gallium/drivers/lima/ir/pp/nir.c b/src/gallium/drivers/lima/ir/pp/nir.c index 3c02c0a..620ecbe 100644 --- a/src/gallium/drivers/lima/ir/pp/nir.c +++ b/src/gallium/drivers/lima/ir/pp/nir.c @@ -352,7 +352,26 @@ static bool ppir_emit_intrinsic(ppir_block *block, nir_instr *ni) return true; case nir_intrinsic_store_output: { - alu_node = ppir_node_create_dest(block, ppir_op_store_color, NULL, 0); + /* In simple cases where the store_output is ssa, that register + * can be directly marked as the output. + * If discard is used or the source is not ssa, things can get a + * lot more complicated, so don't try to optimize those and fall + * back to inserting a mov at the end. + * If the source node will only be able to output to pipeline + * registers, fall back to the mov as well. */ + if (!block->comp->uses_discard && instr->src->is_ssa) { + node = block->comp->var_nodes[instr->src->ssa->index]; + switch (node->op) { + case ppir_op_load_uniform: + case ppir_op_const: + break; + default: + node->is_end = 1; + return true; + } + } + + alu_node = ppir_node_create_dest(block, ppir_op_mov, NULL, 0); if (!alu_node) return false; @@ -370,6 +389,8 @@ static bool ppir_emit_intrinsic(ppir_block *block, nir_instr *ni) ppir_node_add_src(block->comp, &alu_node->node, alu_node->src, instr->src, u_bit_consecutive(0, instr->num_components)); + alu_node->node.is_end = 1; + list_addtail(&alu_node->node.list, &block->node_list); return true; } @@ -724,9 +745,9 @@ static ppir_compiler *ppir_compiler_create(void *prog, unsigned num_reg, unsigne static void ppir_add_ordering_deps(ppir_compiler *comp) { /* Some intrinsics do not have explicit dependencies and thus depend - * on instructions order. Consider discard_if and store_ouput as - * example. If we don't add fake dependency of discard_if to store_output - * scheduler may put store_output first and since store_output terminates + * on instructions order. Consider discard_if and the is_end node as + * example. If we don't add fake dependency of discard_if to is_end, + * scheduler may put the is_end first and since is_end terminates * shader on Utgard PP, rest of it will never be executed. * Add fake dependencies for discard/branch/store to preserve * instruction order. @@ -753,8 +774,8 @@ static void ppir_add_ordering_deps(ppir_compiler *comp) if (prev_node && ppir_node_is_root(node) && node->op != ppir_op_const) { ppir_node_add_dep(prev_node, node, ppir_dep_sequence); } - if (node->op == ppir_op_discard || - node->op == ppir_op_store_color || + if (node->is_end || + node->op == ppir_op_discard || node->op == ppir_op_store_temp || node->op == ppir_op_branch) { prev_node = node; @@ -818,6 +839,7 @@ bool ppir_compile_nir(struct lima_fs_shader_state *prog, struct nir_shader *nir, return false; comp->ra = ra; + comp->uses_discard = nir->info.fs.uses_discard; /* 1st pass: create ppir blocks */ nir_foreach_function(function, nir) { diff --git a/src/gallium/drivers/lima/ir/pp/node.c b/src/gallium/drivers/lima/ir/pp/node.c index fdf444a..b71cbc3 100644 --- a/src/gallium/drivers/lima/ir/pp/node.c +++ b/src/gallium/drivers/lima/ir/pp/node.c @@ -313,14 +313,6 @@ const ppir_op_info ppir_op_infos[] = { .name = "const", .type = ppir_node_type_const, }, - [ppir_op_store_color] = { - .name = "st_col", - .type = ppir_node_type_alu, - .slots = (int []) { - PPIR_INSTR_SLOT_ALU_VEC_ADD, PPIR_INSTR_SLOT_ALU_VEC_MUL, - PPIR_INSTR_SLOT_END - }, - }, [ppir_op_store_temp] = { .name = "st_temp", .type = ppir_node_type_store, @@ -632,6 +624,11 @@ ppir_node *ppir_node_insert_mov(ppir_node *node) ppir_node_add_dep(move, node, ppir_dep_src); list_addtail(&move->list, &node->list); + if (node->is_end) { + node->is_end = false; + move->is_end = true; + } + return move; } diff --git a/src/gallium/drivers/lima/ir/pp/node_to_instr.c b/src/gallium/drivers/lima/ir/pp/node_to_instr.c index 35cf5f1..14b8c41 100644 --- a/src/gallium/drivers/lima/ir/pp/node_to_instr.c +++ b/src/gallium/drivers/lima/ir/pp/node_to_instr.c @@ -75,7 +75,7 @@ static bool ppir_do_node_to_instr_try_insert(ppir_block *block, ppir_node *node) return ppir_instr_insert_node(succ->instr, node); } -static bool ppir_do_one_node_to_instr(ppir_block *block, ppir_node *node, ppir_node **next) +static bool ppir_do_one_node_to_instr(ppir_block *block, ppir_node *node) { switch (node->type) { case ppir_node_type_alu: @@ -105,9 +105,6 @@ static bool ppir_do_one_node_to_instr(ppir_block *block, ppir_node *node, ppir_n if (!node->instr && !create_new_instr(block, node)) return false; - if (node->op == ppir_op_store_color) - node->instr->is_end = true; - break; } case ppir_node_type_load: @@ -195,15 +192,13 @@ static bool ppir_do_one_node_to_instr(ppir_block *block, ppir_node *node, ppir_n static bool ppir_do_node_to_instr(ppir_block *block, ppir_node *node) { - ppir_node *next = node; - /* first try pipeline sched, if that didn't succeed try normal scheduling */ if (!ppir_do_node_to_instr_try_insert(block, node)) - if (!ppir_do_one_node_to_instr(block, node, &next)) + if (!ppir_do_one_node_to_instr(block, node)) return false; - /* next may have been updated in ppir_do_one_node_to_instr */ - node = next; + if (node->is_end) + node->instr->is_end = true; /* we have to make sure the dep not be destroyed (due to * succ change) in ppir_do_node_to_instr, otherwise we can't diff --git a/src/gallium/drivers/lima/ir/pp/ppir.h b/src/gallium/drivers/lima/ir/pp/ppir.h index cec918d..59887a7 100644 --- a/src/gallium/drivers/lima/ir/pp/ppir.h +++ b/src/gallium/drivers/lima/ir/pp/ppir.h @@ -108,7 +108,6 @@ typedef enum { ppir_op_load_temp, ppir_op_store_temp, - ppir_op_store_color, ppir_op_const, @@ -162,6 +161,7 @@ typedef struct ppir_node { struct ppir_instr *instr; int instr_pos; struct ppir_block *block; + bool is_end; /* for scheduler */ struct list_head succ_list; @@ -385,6 +385,7 @@ typedef struct ppir_compiler { struct ra_regs *ra; struct lima_fs_shader_state *prog; + bool uses_discard; /* for scheduler */ int sched_instr_base; diff --git a/src/gallium/drivers/lima/ir/pp/regalloc.c b/src/gallium/drivers/lima/ir/pp/regalloc.c index 4b5d3a8..5c5d13e 100644 --- a/src/gallium/drivers/lima/ir/pp/regalloc.c +++ b/src/gallium/drivers/lima/ir/pp/regalloc.c @@ -138,7 +138,7 @@ static void ppir_regalloc_update_reglist_ssa(ppir_compiler *comp) { list_for_each_entry(ppir_block, block, &comp->block_list, list) { list_for_each_entry(ppir_node, node, &block->node_list, list) { - if (node->op == ppir_op_store_color) + if (node->is_end) continue; if (!node->instr || node->op == ppir_op_const) -- 2.7.4