From cb45120556938070a623b056867545301bce8cac Mon Sep 17 00:00:00 2001
From: Connor Abbott
Date: Fri, 17 Dec 2021 19:48:49 +0100
Subject: [PATCH] ir3: Use (ss) for instructions writing shared regs

The blob uses *both* nops and (ss). It turns out that in some rare cases
the hardware does take more than 6 cycles, at least for movmsk, but
adding nops is unnecessary. I believe the extra nops are only there due
to the immaturity of the blob's implementation of subgroup ops, so we
don't have to copy them - just handle shared reg producers the same as
SFU instructions.

Part-of:
---
 src/freedreno/ir3/ir3.h          | 12 +++++++++++-
 src/freedreno/ir3/ir3_delay.c    |  2 +-
 src/freedreno/ir3/ir3_legalize.c |  5 +++++
 3 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h
index e1d6399..932f85a 100644
--- a/src/freedreno/ir3/ir3.h
+++ b/src/freedreno/ir3/ir3.h
@@ -1670,6 +1670,10 @@ is_local_mem_load(struct ir3_instruction *instr)
 static inline bool
 is_ss_producer(struct ir3_instruction *instr)
 {
+   foreach_dst (dst, instr) {
+      if (dst->flags & IR3_REG_SHARED)
+         return true;
+   }
    return is_sfu(instr) || is_local_mem_load(instr);
 }
 
@@ -1687,7 +1691,13 @@ soft_ss_delay(struct ir3_instruction *instr)
     * and so on. Not quite sure where it tapers out (ie. how many warps share an
     * SFU unit). But 10 seems like a reasonable # to choose:
     */
-   return 10;
+   if (is_sfu(instr) || is_local_mem_load(instr))
+      return 10;
+
+   /* The blob adds 6 nops between shared producers and consumers, and before we
+    * used (ss) this was sufficient in most cases.
+    */
+   return 6;
 }
 
 static inline bool
diff --git a/src/freedreno/ir3/ir3_delay.c b/src/freedreno/ir3/ir3_delay.c
index 83730b5..054f4c8 100644
--- a/src/freedreno/ir3/ir3_delay.c
+++ b/src/freedreno/ir3/ir3_delay.c
@@ -76,7 +76,7 @@ ir3_delayslots(struct ir3_instruction *assigner,
 
    /* assigner must be alu: */
    if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
-       is_mem(consumer) || (assigner->dsts[0]->flags & IR3_REG_SHARED)) {
+       is_mem(consumer)) {
       return 6;
    } else {
       /* In mergedregs mode, there is an extra 2-cycle penalty when half of
diff --git a/src/freedreno/ir3/ir3_legalize.c b/src/freedreno/ir3/ir3_legalize.c
index bf8906f..f46312d 100644
--- a/src/freedreno/ir3/ir3_legalize.c
+++ b/src/freedreno/ir3/ir3_legalize.c
@@ -255,6 +255,11 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
       if (is_sfu(n))
          regmask_set(&state->needs_ss, n->dsts[0]);
 
+      foreach_dst (dst, n) {
+         if (dst->flags & IR3_REG_SHARED)
+            regmask_set(&state->needs_ss, dst);
+      }
+
       if (is_tex_or_prefetch(n)) {
          regmask_set(&state->needs_sy, n->dsts[0]);
          if (n->opc == OPC_META_TEX_PREFETCH)
-- 
2.7.4