The blob uses *both* nops and (ss). It turns out that in some rare cases
the hardware does take more than 6 cycles, at least for movmsk, so the
(ss) is needed, but adding nops on top of it is unnecessary. I believe the
extra nops are only there due
to the immaturity of the blob's implementation of subgroup ops, so we
don't have to copy them - just handle shared reg producers the same as
SFU instructions.
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14246>
static inline bool
is_ss_producer(struct ir3_instruction *instr)
{
+ foreach_dst (dst, instr) {
+ if (dst->flags & IR3_REG_SHARED)
+ return true;
+ }
return is_sfu(instr) || is_local_mem_load(instr);
}
* and so on. Not quite sure where it tapers out (ie. how many warps share an
* SFU unit). But 10 seems like a reasonable # to choose:
*/
- return 10;
+ if (is_sfu(instr) || is_local_mem_load(instr))
+ return 10;
+
+ /* The blob adds 6 nops between shared producers and consumers, and before we
+ * used (ss) this was sufficient in most cases.
+ */
+ return 6;
}
static inline bool
/* assigner must be alu: */
if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
- is_mem(consumer) || (assigner->dsts[0]->flags & IR3_REG_SHARED)) {
+ is_mem(consumer)) {
return 6;
} else {
/* In mergedregs mode, there is an extra 2-cycle penalty when half of
if (is_sfu(n))
regmask_set(&state->needs_ss, n->dsts[0]);
+ foreach_dst (dst, n) {
+ if (dst->flags & IR3_REG_SHARED)
+ regmask_set(&state->needs_ss, dst);
+ }
+
if (is_tex_or_prefetch(n)) {
regmask_set(&state->needs_sy, n->dsts[0]);
if (n->opc == OPC_META_TEX_PREFETCH)