ir3: Add support for subgroup arithmetic
author: Connor Abbott <cwabbott0@gmail.com>
Tue, 7 Dec 2021 11:11:05 +0000 (12:11 +0100)
committer: Marge Bot <emma+marge@anholt.net>
Thu, 10 Mar 2022 17:15:29 +0000 (17:15 +0000)
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14107>

src/freedreno/ir3/disasm-a3xx.c
src/freedreno/ir3/instr-a3xx.h
src/freedreno/ir3/ir3.h
src/freedreno/ir3/ir3_compiler_nir.c
src/freedreno/ir3/ir3_lower_subgroups.c
src/freedreno/ir3/ir3_print.c
src/freedreno/ir3/ir3_validate.c

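diff --git a/src/freedreno/ir3/disasm-a3xx.c b/src/freedreno/ir3/disasm-a3xx.c
index c0feeb1..e9f29b6 100644
--- a/src/freedreno/ir3/disasm-a3xx.c
+++ b/src/freedreno/ir3/disasm-a3xx.c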
@@ -192,6 +192,7 @@ static const struct opc_info {
    OPC(1, OPC_READ_COND_MACRO, read_cond.macro),
    OPC(1, OPC_READ_FIRST_MACRO, read_first.macro),
    OPC(1, OPC_SWZ_SHARED_MACRO, swz_shared.macro),
+   OPC(1, OPC_SCAN_MACRO, scan.macro),
 
    /* category 2: */
    OPC(2, OPC_ADD_F,        add.f),
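diff --git a/src/freedreno/ir3/instr-a3xx.h b/src/freedreno/ir3/instr-a3xx.h
index 0693248..f1f41de 100644
--- a/src/freedreno/ir3/instr-a3xx.h
+++ b/src/freedreno/ir3/instr-a3xx.h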
@@ -127,6 +127,9 @@ typedef enum {
    OPC_READ_FIRST_MACRO = _OPC(1, 55),
    OPC_SWZ_SHARED_MACRO = _OPC(1, 56),
 
+   /* Macros that expand to a loop */
+   OPC_SCAN_MACRO      = _OPC(1, 57),
+
    /* category 2: */
    OPC_ADD_F           = _OPC(2, 0),
    OPC_MIN_F           = _OPC(2, 1),
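diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h
index c71e8da..25de40d 100644
--- a/src/freedreno/ir3/ir3.h
+++ b/src/freedreno/ir3/ir3.h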
@@ -239,6 +239,22 @@ struct ir3_register {
       arr[arr##_count++] = __VA_ARGS__;                                        \
    } while (0)
 
+typedef enum {
+   REDUCE_OP_ADD_U,
+   REDUCE_OP_ADD_F,
+   REDUCE_OP_MUL_U,
+   REDUCE_OP_MUL_F,
+   REDUCE_OP_MIN_U,
+   REDUCE_OP_MIN_S,
+   REDUCE_OP_MIN_F,
+   REDUCE_OP_MAX_U,
+   REDUCE_OP_MAX_S,
+   REDUCE_OP_MAX_F,
+   REDUCE_OP_AND_B,
+   REDUCE_OP_OR_B,
+   REDUCE_OP_XOR_B,
+} reduce_op_t;
+
 struct ir3_instruction {
    struct ir3_block *block;
    opc_t opc;
@@ -324,6 +340,7 @@ struct ir3_instruction {
       struct {
          type_t src_type, dst_type;
          round_t round;
+         reduce_op_t reduce_op;
       } cat1;
       struct {
          enum {
@@ -896,6 +913,7 @@ is_subgroup_cond_mov_macro(struct ir3_instruction *instr)
    case OPC_READ_COND_MACRO:
    case OPC_READ_FIRST_MACRO:
    case OPC_SWZ_SHARED_MACRO:
+   case OPC_SCAN_MACRO:
       return true;
    default:
       return false;
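diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c
index 0995480..0e73063 100644
--- a/src/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/freedreno/ir3/ir3_compiler_nir.c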
@@ -1823,6 +1823,148 @@ get_frag_coord(struct ir3_context *ctx, nir_intrinsic_instr *intr)
    return ctx->frag_coord;
 }
 
+/* This is a bit of a hack until ir3_context is converted to store SSA values
+ * as ir3_register's instead of ir3_instruction's. Pick out a given destination
+ * of an instruction with multiple destinations using a mov that will get folded
+ * away by ir3_cp.
+ */
+static struct ir3_instruction *
+create_multidst_mov(struct ir3_block *block, struct ir3_register *dst)
+{
+   struct ir3_instruction *mov = ir3_instr_create(block, OPC_MOV, 1, 1);
+   unsigned dst_flags = dst->flags & IR3_REG_HALF;
+   unsigned src_flags = dst->flags & (IR3_REG_HALF | IR3_REG_SHARED);
+
+   __ssa_dst(mov)->flags |= dst_flags;
+   struct ir3_register *src =
+      ir3_src_create(mov, INVALID_REG, IR3_REG_SSA | src_flags);
+   src->wrmask = dst->wrmask;
+   src->def = dst;
+   debug_assert(!(dst->flags & IR3_REG_RELATIV));
+   mov->cat1.src_type = mov->cat1.dst_type =
+      (dst->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
+   return mov;
+}
+
+static reduce_op_t
+get_reduce_op(nir_op opc)
+{
+   switch (opc) {
+   case nir_op_iadd: return REDUCE_OP_ADD_U;
+   case nir_op_fadd: return REDUCE_OP_ADD_F;
+   case nir_op_imul: return REDUCE_OP_MUL_U;
+   case nir_op_fmul: return REDUCE_OP_MUL_F;
+   case nir_op_umin: return REDUCE_OP_MIN_U;
+   case nir_op_imin: return REDUCE_OP_MIN_S;
+   case nir_op_fmin: return REDUCE_OP_MIN_F;
+   case nir_op_umax: return REDUCE_OP_MAX_U;
+   case nir_op_imax: return REDUCE_OP_MAX_S;
+   case nir_op_fmax: return REDUCE_OP_MAX_F;
+   case nir_op_iand: return REDUCE_OP_AND_B;
+   case nir_op_ior:  return REDUCE_OP_OR_B;
+   case nir_op_ixor: return REDUCE_OP_XOR_B;
+   default:
+      unreachable("unknown NIR reduce op");
+   }
+}
+
+static uint32_t
+get_reduce_identity(nir_op opc, unsigned size)
+{
+   switch (opc) {
+   case nir_op_iadd:
+      return 0;
+   case nir_op_fadd: 
+      return size == 32 ? fui(0.0f) : _mesa_float_to_half(0.0f);
+   case nir_op_imul:
+      return 1;
+   case nir_op_fmul:
+      return size == 32 ? fui(1.0f) : _mesa_float_to_half(1.0f);
+   case nir_op_umax:
+      return 0;
+   case nir_op_imax:
+      return size == 32 ? INT32_MIN : (uint32_t)INT16_MIN;
+   case nir_op_fmax:
+      return size == 32 ? fui(-INFINITY) : _mesa_float_to_half(-INFINITY);
+   case nir_op_umin:
+      return size == 32 ? UINT32_MAX : UINT16_MAX;
+   case nir_op_imin:
+      return size == 32 ? INT32_MAX : (uint32_t)INT16_MAX;
+   case nir_op_fmin:
+      return size == 32 ? fui(INFINITY) : _mesa_float_to_half(INFINITY);
+   case nir_op_iand:
+      return size == 32 ? ~0 : (size == 16 ? (uint32_t)(uint16_t)~0 : 1);
+   case nir_op_ior:
+      return 0;
+   case nir_op_ixor:
+      return 0;
+   default:
+      unreachable("unknown NIR reduce op");
+   }
+}
+
+static struct ir3_instruction *
+emit_intrinsic_reduce(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+   struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
+   nir_op nir_reduce_op = (nir_op) nir_intrinsic_reduction_op(intr);
+   reduce_op_t reduce_op = get_reduce_op(nir_reduce_op);
+   unsigned dst_size = nir_dest_bit_size(intr->dest);
+   unsigned flags = (ir3_bitsize(ctx, dst_size) == 16) ? IR3_REG_HALF : 0;
+
+   /* Note: the shared reg is initialized to the identity, so we need it to
+    * always be 32-bit even when the source isn't because half shared regs are
+    * not supported.
+    */
+   struct ir3_instruction *identity =
+      create_immed(ctx->block, get_reduce_identity(nir_reduce_op, dst_size));
+   identity = ir3_READ_FIRST_MACRO(ctx->block, identity, 0);
+   identity->dsts[0]->flags |= IR3_REG_SHARED;
+
+   /* OPC_SCAN_MACRO has the following destinations:
+    * - Exclusive scan result (interferes with source)
+    * - Inclusive scan result
+    * - Shared reg reduction result, must be initialized to the identity
+    *
+    * The loop computes all three results at the same time, we just have to
+    * choose which destination to return.
+    */
+   struct ir3_instruction *scan =
+      ir3_instr_create(ctx->block, OPC_SCAN_MACRO, 3, 2);
+   scan->cat1.reduce_op = reduce_op;
+
+   struct ir3_register *exclusive = __ssa_dst(scan);
+   exclusive->flags |= flags | IR3_REG_EARLY_CLOBBER;
+   struct ir3_register *inclusive = __ssa_dst(scan);
+   inclusive->flags |= flags;
+   struct ir3_register *reduce = __ssa_dst(scan);
+   reduce->flags |= IR3_REG_SHARED;
+
+   /* The 32-bit multiply macro reads its sources after writing a partial result
+    * to the destination, therefore inclusive also interferes with the source.
+    */
+   if (reduce_op == REDUCE_OP_MUL_U && dst_size == 32)
+      inclusive->flags |= IR3_REG_EARLY_CLOBBER;
+
+   /* Normal source */
+   __ssa_src(scan, src, 0);
+
+   /* shared reg tied source */
+   struct ir3_register *reduce_init = __ssa_src(scan, identity, IR3_REG_SHARED);
+   ir3_reg_tie(reduce, reduce_init);
+   
+   struct ir3_register *dst;
+   switch (intr->intrinsic) {
+   case nir_intrinsic_reduce: dst = reduce; break;
+   case nir_intrinsic_inclusive_scan: dst = inclusive; break;
+   case nir_intrinsic_exclusive_scan: dst = exclusive; break;
+   default:
+      unreachable("unknown reduce intrinsic");
+   }
+
+   return create_multidst_mov(ctx->block, dst);
+}
+
 static void setup_input(struct ir3_context *ctx, nir_intrinsic_instr *intr);
 static void setup_output(struct ir3_context *ctx, nir_intrinsic_instr *intr);
 
@@ -2425,6 +2567,12 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
       break;
    }
 
+   case nir_intrinsic_reduce:
+   case nir_intrinsic_inclusive_scan:
+   case nir_intrinsic_exclusive_scan:
+      dst[0] = emit_intrinsic_reduce(ctx, intr);
+      break;
+
    default:
       ir3_context_error(ctx, "Unhandled intrinsic type: %s\n",
                         nir_intrinsic_infos[intr->intrinsic].name);
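diff --git a/src/freedreno/ir3/ir3_lower_subgroups.c b/src/freedreno/ir3/ir3_lower_subgroups.c
index 041be19..afc88a1 100644
--- a/src/freedreno/ir3/ir3_lower_subgroups.c
+++ b/src/freedreno/ir3/ir3_lower_subgroups.c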
@@ -71,14 +71,106 @@ mov_immed(struct ir3_register *dst, struct ir3_block *block, unsigned immed)
    mov->repeat = util_last_bit(mov_dst->wrmask) - 1;
 }
 
+static void
+mov_reg(struct ir3_block *block, struct ir3_register *dst,
+        struct ir3_register *src)
+{
+   struct ir3_instruction *mov = ir3_instr_create(block, OPC_MOV, 1, 1);
+
+   struct ir3_register *mov_dst =
+      ir3_dst_create(mov, dst->num, dst->flags & (IR3_REG_HALF | IR3_REG_SHARED));
+   struct ir3_register *mov_src =
+      ir3_src_create(mov, src->num, src->flags & (IR3_REG_HALF | IR3_REG_SHARED));
+   mov_dst->wrmask = dst->wrmask;
+   mov_src->wrmask = src->wrmask;
+   mov->repeat = util_last_bit(mov_dst->wrmask) - 1;
+
+   mov->cat1.dst_type = (dst->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
+   mov->cat1.src_type = (src->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
+}
+
+static void
+binop(struct ir3_block *block, opc_t opc, struct ir3_register *dst,
+      struct ir3_register *src0, struct ir3_register *src1)
+{
+   struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 2);
+   
+   unsigned flags = dst->flags & IR3_REG_HALF;
+   struct ir3_register *instr_dst = ir3_dst_create(instr, dst->num, flags);
+   struct ir3_register *instr_src0 = ir3_src_create(instr, src0->num, flags);
+   struct ir3_register *instr_src1 = ir3_src_create(instr, src1->num, flags);
+
+   instr_dst->wrmask = dst->wrmask;
+   instr_src0->wrmask = src0->wrmask;
+   instr_src1->wrmask = src1->wrmask;
+   instr->repeat = util_last_bit(instr_dst->wrmask) - 1;
+}
+
+static void
+triop(struct ir3_block *block, opc_t opc, struct ir3_register *dst,
+      struct ir3_register *src0, struct ir3_register *src1,
+      struct ir3_register *src2)
+{
+   struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 3);
+   
+   unsigned flags = dst->flags & IR3_REG_HALF;
+   struct ir3_register *instr_dst = ir3_dst_create(instr, dst->num, flags);
+   struct ir3_register *instr_src0 = ir3_src_create(instr, src0->num, flags);
+   struct ir3_register *instr_src1 = ir3_src_create(instr, src1->num, flags);
+   struct ir3_register *instr_src2 = ir3_src_create(instr, src2->num, flags);
+
+   instr_dst->wrmask = dst->wrmask;
+   instr_src0->wrmask = src0->wrmask;
+   instr_src1->wrmask = src1->wrmask;
+   instr_src2->wrmask = src2->wrmask;
+   instr->repeat = util_last_bit(instr_dst->wrmask) - 1;
+}
+
+static void
+do_reduce(struct ir3_block *block, reduce_op_t opc,
+          struct ir3_register *dst, struct ir3_register *src0,
+          struct ir3_register *src1)
+{
+   switch (opc) {
+#define CASE(name)                                                             \
+   case REDUCE_OP_##name:                                                      \
+      binop(block, OPC_##name, dst, src0, src1);                               \
+      break;
+
+   CASE(ADD_U)
+   CASE(ADD_F)
+   CASE(MUL_F)
+   CASE(MIN_U)
+   CASE(MIN_S)
+   CASE(MIN_F)
+   CASE(MAX_U)
+   CASE(MAX_S)
+   CASE(MAX_F)
+   CASE(AND_B)
+   CASE(OR_B)
+   CASE(XOR_B)
+
+#undef CASE
+
+   case REDUCE_OP_MUL_U:
+      if (dst->flags & IR3_REG_HALF) {
+         binop(block, OPC_MUL_S24, dst, src0, src1);
+      } else {
+         /* 32-bit multiplication macro - see ir3_nir_imul */
+         binop(block, OPC_MULL_U, dst, src0, src1);
+         triop(block, OPC_MADSH_M16, dst, src0, src1, dst);
+         triop(block, OPC_MADSH_M16, dst, src1, src0, dst);
+      }
+      break;
+   }
+}
+
 static struct ir3_block *
 split_block(struct ir3 *ir, struct ir3_block *before_block,
-            struct ir3_instruction *instr, struct ir3_block **then)
+            struct ir3_instruction *instr)
 {
-   struct ir3_block *then_block = ir3_block_create(ir);
    struct ir3_block *after_block = ir3_block_create(ir);
-   list_add(&then_block->node, &before_block->node);
-   list_add(&after_block->node, &then_block->node);
+   list_add(&after_block->node, &before_block->node);
 
    for (unsigned i = 0; i < ARRAY_SIZE(before_block->successors); i++) {
       after_block->successors[i] = before_block->successors[i];
@@ -96,19 +188,8 @@ split_block(struct ir3 *ir, struct ir3_block *before_block,
       }
    }
 
-   before_block->successors[0] = then_block;
-   before_block->successors[1] = after_block;
-   before_block->physical_successors[0] = then_block;
-   before_block->physical_successors[1] = after_block;
-   ir3_block_add_predecessor(then_block, before_block);
-   ir3_block_add_predecessor(after_block, before_block);
-   ir3_block_add_physical_predecessor(then_block, before_block);
-   ir3_block_add_physical_predecessor(after_block, before_block);
-
-   then_block->successors[0] = after_block;
-   then_block->physical_successors[0] = after_block;
-   ir3_block_add_predecessor(after_block, then_block);
-   ir3_block_add_physical_predecessor(after_block, then_block);
+   before_block->successors[0] = before_block->successors[1] = NULL;
+   before_block->physical_successors[0] = before_block->physical_successors[1] = NULL;
 
    foreach_instr_from_safe (rem_instr, &instr->node,
                             &before_block->instr_list) {
@@ -120,10 +201,39 @@ split_block(struct ir3 *ir, struct ir3_block *before_block,
    after_block->brtype = before_block->brtype;
    after_block->condition = before_block->condition;
 
-   *then = then_block;
    return after_block;
 }
 
+static void
+link_blocks_physical(struct ir3_block *pred, struct ir3_block *succ,
+                     unsigned index)
+{
+   pred->physical_successors[index] = succ;
+   ir3_block_add_physical_predecessor(succ, pred);
+}
+
+static void
+link_blocks(struct ir3_block *pred, struct ir3_block *succ, unsigned index)
+{
+   pred->successors[index] = succ;
+   ir3_block_add_predecessor(succ, pred);
+   link_blocks_physical(pred, succ, index);
+}
+
+static struct ir3_block *
+create_if(struct ir3 *ir, struct ir3_block *before_block,
+          struct ir3_block *after_block)
+{
+   struct ir3_block *then_block = ir3_block_create(ir);
+   list_add(&then_block->node, &before_block->node);
+
+   link_blocks(before_block, then_block, 0);
+   link_blocks(before_block, after_block, 1);
+   link_blocks(then_block, after_block, 0);
+
+   return then_block;
+}
+
 static bool
 lower_instr(struct ir3 *ir, struct ir3_block **block, struct ir3_instruction *instr)
 {
@@ -135,106 +245,156 @@ lower_instr(struct ir3 *ir, struct ir3_block **block, struct ir3_instruction *in
    case OPC_READ_COND_MACRO:
    case OPC_READ_FIRST_MACRO:
    case OPC_SWZ_SHARED_MACRO:
+   case OPC_SCAN_MACRO:
       break;
    default:
       return false;
    }
 
    struct ir3_block *before_block = *block;
-   struct ir3_block *then_block;
-   struct ir3_block *after_block =
-      split_block(ir, before_block, instr, &then_block);
-
-   /* For ballot, the destination must be initialized to 0 before we do
-    * the movmsk because the condition may be 0 and then the movmsk will
-    * be skipped. Because it's a shared register we have to wrap the
-    * initialization in a getone block.
-    */
-   if (instr->opc == OPC_BALLOT_MACRO) {
-      before_block->brtype = IR3_BRANCH_GETONE;
-      before_block->condition = NULL;
-      mov_immed(instr->dsts[0], then_block, 0);
-      before_block = after_block;
-      after_block = split_block(ir, before_block, instr, &then_block);
-   }
+   struct ir3_block *after_block = split_block(ir, before_block, instr);
 
-   switch (instr->opc) {
-   case OPC_BALLOT_MACRO:
-   case OPC_READ_COND_MACRO:
-   case OPC_ANY_MACRO:
-   case OPC_ALL_MACRO:
-      before_block->condition = instr->srcs[0]->def->instr;
-      break;
-   default:
-      before_block->condition = NULL;
-      break;
-   }
+   if (instr->opc == OPC_SCAN_MACRO) {
+      /* The pseudo-code for the scan macro is:
+       *
+       * while (true) {
+       *    header:
+       *    if (elect()) {
+       *       exit:
+       *       exclusive = reduce;
+       *       inclusive = src OP exclusive;
+       *       reduce = inclusive;
+       *    }
+       *    footer:
+       * }
+       *
+       * This is based on the blob's sequence, and carefully crafted to avoid
+       * using the shared register "reduce" except in move instructions, since
+       * using it in the actual OP isn't possible for half-registers.
+       */
+      struct ir3_block *header = ir3_block_create(ir);
+      list_add(&header->node, &before_block->node);
 
-   switch (instr->opc) {
-   case OPC_BALLOT_MACRO:
-   case OPC_READ_COND_MACRO:
-      before_block->brtype = IR3_BRANCH_COND;
-      break;
-   case OPC_ANY_MACRO:
-      before_block->brtype = IR3_BRANCH_ANY;
-      break;
-   case OPC_ALL_MACRO:
-      before_block->brtype = IR3_BRANCH_ALL;
-      break;
-   case OPC_ELECT_MACRO:
-   case OPC_READ_FIRST_MACRO:
-   case OPC_SWZ_SHARED_MACRO:
-      before_block->brtype = IR3_BRANCH_GETONE;
-      break;
-   default:
-      unreachable("bad opcode");
-   }
+      struct ir3_block *exit = ir3_block_create(ir);
+      list_add(&exit->node, &header->node);
 
-   switch (instr->opc) {
-   case OPC_ALL_MACRO:
-   case OPC_ANY_MACRO:
-   case OPC_ELECT_MACRO:
-      mov_immed(instr->dsts[0], then_block, 1);
-      mov_immed(instr->dsts[0], before_block, 0);
-      break;
+      struct ir3_block *footer = ir3_block_create(ir);
+      list_add(&footer->node, &exit->node);
 
-   case OPC_BALLOT_MACRO: {
-      unsigned comp_count = util_last_bit(instr->dsts[0]->wrmask);
-      struct ir3_instruction *movmsk =
-         ir3_instr_create(then_block, OPC_MOVMSK, 1, 0);
-      ir3_dst_create(movmsk, instr->dsts[0]->num, instr->dsts[0]->flags);
-      movmsk->repeat = comp_count - 1;
-      break;
-   }
+      link_blocks(before_block, header, 0);
 
-   case OPC_READ_COND_MACRO:
-   case OPC_READ_FIRST_MACRO: {
-      struct ir3_instruction *mov =
-         ir3_instr_create(then_block, OPC_MOV, 1, 1);
-      unsigned src = instr->opc == OPC_READ_COND_MACRO ? 1 : 0;
-      ir3_dst_create(mov, instr->dsts[0]->num, instr->dsts[0]->flags);
-      struct ir3_register *new_src = ir3_src_create(mov, 0, 0);
-      *new_src = *instr->srcs[src];
-      mov->cat1.dst_type = TYPE_U32;
-      mov->cat1.src_type =
-         (new_src->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
-      break;
-   }
+      link_blocks(header, exit, 0);
+      link_blocks(header, footer, 1);
+      header->brtype = IR3_BRANCH_GETONE;
 
-   case OPC_SWZ_SHARED_MACRO: {
-      struct ir3_instruction *swz =
-         ir3_instr_create(then_block, OPC_SWZ, 2, 2);
-      ir3_dst_create(swz, instr->dsts[0]->num, instr->dsts[0]->flags);
-      ir3_dst_create(swz, instr->dsts[1]->num, instr->dsts[1]->flags);
-      ir3_src_create(swz, instr->srcs[0]->num, instr->srcs[0]->flags);
-      ir3_src_create(swz, instr->srcs[1]->num, instr->srcs[1]->flags);
-      swz->cat1.dst_type = swz->cat1.src_type = TYPE_U32;
-      swz->repeat = 1;
-      break;
-   }
+      link_blocks(exit, after_block, 0);
+      link_blocks_physical(exit, footer, 1);
 
-   default:
-      unreachable("bad opcode");
+      link_blocks(footer, header, 0);
+
+      struct ir3_register *exclusive = instr->dsts[0];
+      struct ir3_register *inclusive = instr->dsts[1];
+      struct ir3_register *reduce = instr->dsts[2];
+      struct ir3_register *src = instr->srcs[0];
+
+      mov_reg(exit, exclusive, reduce);
+      do_reduce(exit, instr->cat1.reduce_op, inclusive, src, exclusive);
+      mov_reg(exit, reduce, inclusive);
+   } else {
+      struct ir3_block *then_block = create_if(ir, before_block, after_block);
+
+      /* For ballot, the destination must be initialized to 0 before we do
+       * the movmsk because the condition may be 0 and then the movmsk will
+       * be skipped. Because it's a shared register we have to wrap the
+       * initialization in a getone block.
+       */
+      if (instr->opc == OPC_BALLOT_MACRO) {
+         before_block->brtype = IR3_BRANCH_GETONE;
+         before_block->condition = NULL;
+         mov_immed(instr->dsts[0], then_block, 0);
+         before_block = after_block;
+         after_block = split_block(ir, before_block, instr);
+         then_block = create_if(ir, before_block, after_block);
+      }
+
+      switch (instr->opc) {
+      case OPC_BALLOT_MACRO:
+      case OPC_READ_COND_MACRO:
+      case OPC_ANY_MACRO:
+      case OPC_ALL_MACRO:
+         before_block->condition = instr->srcs[0]->def->instr;
+         break;
+      default:
+         before_block->condition = NULL;
+         break;
+      }
+
+      switch (instr->opc) {
+      case OPC_BALLOT_MACRO:
+      case OPC_READ_COND_MACRO:
+         before_block->brtype = IR3_BRANCH_COND;
+         break;
+      case OPC_ANY_MACRO:
+         before_block->brtype = IR3_BRANCH_ANY;
+         break;
+      case OPC_ALL_MACRO:
+         before_block->brtype = IR3_BRANCH_ALL;
+         break;
+      case OPC_ELECT_MACRO:
+      case OPC_READ_FIRST_MACRO:
+      case OPC_SWZ_SHARED_MACRO:
+         before_block->brtype = IR3_BRANCH_GETONE;
+         break;
+      default:
+         unreachable("bad opcode");
+      }
+
+      switch (instr->opc) {
+      case OPC_ALL_MACRO:
+      case OPC_ANY_MACRO:
+      case OPC_ELECT_MACRO:
+         mov_immed(instr->dsts[0], then_block, 1);
+         mov_immed(instr->dsts[0], before_block, 0);
+         break;
+
+      case OPC_BALLOT_MACRO: {
+         unsigned comp_count = util_last_bit(instr->dsts[0]->wrmask);
+         struct ir3_instruction *movmsk =
+            ir3_instr_create(then_block, OPC_MOVMSK, 1, 0);
+         ir3_dst_create(movmsk, instr->dsts[0]->num, instr->dsts[0]->flags);
+         movmsk->repeat = comp_count - 1;
+         break;
+      }
+
+      case OPC_READ_COND_MACRO:
+      case OPC_READ_FIRST_MACRO: {
+         struct ir3_instruction *mov =
+            ir3_instr_create(then_block, OPC_MOV, 1, 1);
+         unsigned src = instr->opc == OPC_READ_COND_MACRO ? 1 : 0;
+         ir3_dst_create(mov, instr->dsts[0]->num, instr->dsts[0]->flags);
+         struct ir3_register *new_src = ir3_src_create(mov, 0, 0);
+         *new_src = *instr->srcs[src];
+         mov->cat1.dst_type = TYPE_U32;
+         mov->cat1.src_type =
+            (new_src->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
+         break;
+      }
+
+      case OPC_SWZ_SHARED_MACRO: {
+         struct ir3_instruction *swz =
+            ir3_instr_create(then_block, OPC_SWZ, 2, 2);
+         ir3_dst_create(swz, instr->dsts[0]->num, instr->dsts[0]->flags);
+         ir3_dst_create(swz, instr->dsts[1]->num, instr->dsts[1]->flags);
+         ir3_src_create(swz, instr->srcs[0]->num, instr->srcs[0]->flags);
+         ir3_src_create(swz, instr->srcs[1]->num, instr->srcs[1]->flags);
+         swz->cat1.dst_type = swz->cat1.src_type = TYPE_U32;
+         swz->repeat = 1;
+         break;
+      }
+
+      default:
+         unreachable("bad opcode");
+      }
    }
 
    *block = after_block;
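diff --git a/src/freedreno/ir3/ir3_print.c b/src/freedreno/ir3/ir3_print.c
index 48f7cdb..431ae3c 100644
--- a/src/freedreno/ir3/ir3_print.c
+++ b/src/freedreno/ir3/ir3_print.c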
@@ -137,7 +137,51 @@ print_instr_name(struct log_stream *stream, struct ir3_instruction *instr,
                                 disasm_a3xx_instr_name(instr->opc));
       }
 
-      if (instr->opc != OPC_MOVMSK) {
+      if (instr->opc == OPC_SCAN_MACRO) {
+         switch (instr->cat1.reduce_op) {
+         case REDUCE_OP_ADD_U:
+            mesa_log_stream_printf(stream, ".add.u");
+            break;
+         case REDUCE_OP_ADD_F:
+            mesa_log_stream_printf(stream, ".add.f");
+            break;
+         case REDUCE_OP_MUL_U:
+            mesa_log_stream_printf(stream, ".mul.u");
+            break;
+         case REDUCE_OP_MUL_F:
+            mesa_log_stream_printf(stream, ".mul.f");
+            break;
+         case REDUCE_OP_MIN_U:
+            mesa_log_stream_printf(stream, ".min.u");
+            break;
+         case REDUCE_OP_MIN_S:
+            mesa_log_stream_printf(stream, ".min.s");
+            break;
+         case REDUCE_OP_MIN_F:
+            mesa_log_stream_printf(stream, ".min.f");
+            break;
+         case REDUCE_OP_MAX_U:
+            mesa_log_stream_printf(stream, ".max.u");
+            break;
+         case REDUCE_OP_MAX_S:
+            mesa_log_stream_printf(stream, ".max.s");
+            break;
+         case REDUCE_OP_MAX_F:
+            mesa_log_stream_printf(stream, ".max.f");
+            break;
+         case REDUCE_OP_AND_B:
+            mesa_log_stream_printf(stream, ".and.b");
+            break;
+         case REDUCE_OP_OR_B:
+            mesa_log_stream_printf(stream, ".or.b");
+            break;
+         case REDUCE_OP_XOR_B:
+            mesa_log_stream_printf(stream, ".xor.b");
+            break;
+         }
+      }
+
+      if (instr->opc != OPC_MOVMSK && instr->opc != OPC_SCAN_MACRO) {
          mesa_log_stream_printf(stream, ".%s%s",
                                 type_name(instr->cat1.src_type),
                                 type_name(instr->cat1.dst_type));
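diff --git a/src/freedreno/ir3/ir3_validate.c b/src/freedreno/ir3/ir3_validate.c
index f10116a..b842dae 100644
--- a/src/freedreno/ir3/ir3_validate.c
+++ b/src/freedreno/ir3/ir3_validate.c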
@@ -238,6 +238,14 @@ validate_instr(struct ir3_validate_ctx *ctx, struct ir3_instruction *instr)
       } else if (instr->opc == OPC_ELECT_MACRO) {
          validate_assert(ctx, instr->dsts_count == 1);
          validate_assert(ctx, !(instr->dsts[0]->flags & IR3_REG_SHARED));
+      } else if (instr->opc == OPC_SCAN_MACRO) {
+         validate_assert(ctx, instr->dsts_count == 3);
+         validate_assert(ctx, instr->srcs_count == 2);
+         validate_assert(ctx, reg_class_flags(instr->dsts[0]) ==
+                              reg_class_flags(instr->srcs[0]));
+         validate_assert(ctx, reg_class_flags(instr->dsts[1]) ==
+                              reg_class_flags(instr->srcs[0]));
+         validate_assert(ctx, reg_class_flags(instr->dsts[2]) == IR3_REG_SHARED);
       } else {
          foreach_dst (dst, instr)
             validate_reg_size(ctx, dst, instr->cat1.dst_type);