From 97dcad8d3e6cce946a71d2fdf0ffe65d113d64e1 Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Date: Thu, 7 Feb 2019 03:39:25 +0000
Subject: [PATCH] panfrost: Clean-up one-argument passing quirk

Most Midgard instructions take two-arguments logically; there are always
two arguments at the assembly level. For the few instructions that take
only a single argument, generally the second argument slot is unused,
with a zero inline constant occupying the space. fmov/imov are the
exception, where the first argument is filled with r24 and the logical
argument is in the second slot.

Previously, these constraints were handled by a delicate, buggy series
of hacks. This commit removes these hacks. Instead, we look at the
logical number of arguments (from NIR), switching between two argument
and one-argument-one-zero style. We then introduce a quirk for the
flipped style, which applies to fmov/imov.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
---
 src/gallium/drivers/panfrost/midgard/helpers.h     |  53 ++++---
 .../drivers/panfrost/midgard/midgard_compile.c     | 173 ++++++++++-----------
 2 files changed, 112 insertions(+), 114 deletions(-)

diff --git a/src/gallium/drivers/panfrost/midgard/helpers.h b/src/gallium/drivers/panfrost/midgard/helpers.h
index 9e365dc..b8ccea8 100644
--- a/src/gallium/drivers/panfrost/midgard/helpers.h
+++ b/src/gallium/drivers/panfrost/midgard/helpers.h
@@ -52,6 +52,14 @@
 #define OP_CHANNEL_COUNT(c) ((c - 1) << 0)
 #define GET_CHANNEL_COUNT(c) ((c & (0x3 << 0)) ? ((c & (0x3 << 0)) + 1) : 0)
 
+/* For instructions that take a single argument, normally the first argument
+ * slot is used for the argument and the second slot is a dummy #0 constant.
+ * However, there are exceptions: instructions like fmov store their argument
+ * in the _second_ slot and store a dummy r24 in the first slot, designated by
+ * QUIRK_FLIPPED_R24 */
+
+#define QUIRK_FLIPPED_R24 (1 << 2)
+
 /* Vector-independant shorthands for the above; these numbers are arbitrary and
  * not from the ISA. Convert to the above with unit_enum_to_midgard */
 
@@ -162,24 +170,24 @@ midgard_is_integer_op(int op)
 #define UNIT_SMUL ALU_ENAB_SCAL_MUL
 #define UNIT_VLUT ALU_ENAB_VEC_LUT
 
-/* Shorthands for usual combinations of units. LUT is intentionally excluded
- * since it's nutty. */
+/* Shorthands for usual combinations of units */
 
 #define UNITS_MUL (UNIT_VMUL | UNIT_SMUL)
 #define UNITS_ADD (UNIT_VADD | UNIT_SADD)
-#define UNITS_ALL (UNITS_MUL | UNITS_ADD)
+#define UNITS_MOST (UNITS_MUL | UNITS_ADD)
+#define UNITS_ALL (UNITS_MOST | UNIT_VLUT)
 #define UNITS_SCALAR (UNIT_SADD | UNIT_SMUL)
 #define UNITS_VECTOR (UNIT_VMUL | UNIT_VADD)
 #define UNITS_ANY_VECTOR (UNITS_VECTOR | UNIT_VLUT)
 
-static int alu_opcode_props[256] = {
+static unsigned alu_opcode_props[256] = {
         [midgard_alu_op_fadd]		 = UNITS_ADD,
         [midgard_alu_op_fmul]		 = UNITS_MUL | UNIT_VLUT,
         [midgard_alu_op_fmin]		 = UNITS_MUL | UNITS_ADD,
         [midgard_alu_op_fmax]		 = UNITS_MUL | UNITS_ADD,
-        [midgard_alu_op_imin]		 = UNITS_ALL,
-        [midgard_alu_op_imax]		 = UNITS_ALL,
-        [midgard_alu_op_fmov]		 = UNITS_ALL | UNIT_VLUT,
+        [midgard_alu_op_imin]		 = UNITS_MOST,
+        [midgard_alu_op_imax]		 = UNITS_MOST,
+        [midgard_alu_op_fmov]		 = UNITS_ALL | QUIRK_FLIPPED_R24,
         [midgard_alu_op_ffloor]		 = UNITS_ADD,
         [midgard_alu_op_fceil]		 = UNITS_ADD,
 
@@ -188,19 +196,20 @@ static int alu_opcode_props[256] = {
         [midgard_alu_op_fdot3]		 = UNIT_VMUL | OP_CHANNEL_COUNT(3),
         [midgard_alu_op_fdot4]		 = UNIT_VMUL | OP_CHANNEL_COUNT(4),
 
-        [midgard_alu_op_iadd]		 = UNITS_ADD,
-        [midgard_alu_op_isub]		 = UNITS_ADD,
-        [midgard_alu_op_imul]		 = UNITS_ALL,
-        [midgard_alu_op_imov]		 = UNITS_ALL,
+        /* Incredibly, iadd can run on vmul, etc */
+        [midgard_alu_op_iadd]		 = UNITS_MOST,
+        [midgard_alu_op_isub]		 = UNITS_MOST,
+        [midgard_alu_op_imul]		 = UNITS_MOST,
+        [midgard_alu_op_imov]		 = UNITS_MOST | QUIRK_FLIPPED_R24,
 
         /* For vector comparisons, use ball etc */
-        [midgard_alu_op_feq]		 = UNITS_ALL,
-        [midgard_alu_op_fne]		 = UNITS_ALL,
+        [midgard_alu_op_feq]		 = UNITS_MOST,
+        [midgard_alu_op_fne]		 = UNITS_MOST,
         [midgard_alu_op_flt]		 = UNIT_SADD,
-        [midgard_alu_op_ieq]		 = UNITS_ALL,
-        [midgard_alu_op_ine]		 = UNITS_ALL,
-        [midgard_alu_op_ilt]		 = UNITS_ALL,
-        [midgard_alu_op_ile]		 = UNITS_ALL,
+        [midgard_alu_op_ieq]		 = UNITS_MOST,
+        [midgard_alu_op_ine]		 = UNITS_MOST,
+        [midgard_alu_op_ilt]		 = UNITS_MOST,
+        [midgard_alu_op_ile]		 = UNITS_MOST,
 
         [midgard_alu_op_icsel]		 = UNITS_ADD,
         [midgard_alu_op_fcsel]		 = UNITS_ADD | UNIT_SMUL,
@@ -223,14 +232,14 @@ static int alu_opcode_props[256] = {
         [midgard_alu_op_iand]		 = UNITS_ADD, /* XXX: Test case where it's right on smul but not sadd */
         [midgard_alu_op_ior]		 = UNITS_ADD,
         [midgard_alu_op_ixor]		 = UNITS_ADD,
-        [midgard_alu_op_inot]		 = UNITS_ALL,
+        [midgard_alu_op_inot]		 = UNITS_MOST,
         [midgard_alu_op_ishl]		 = UNITS_ADD,
         [midgard_alu_op_iasr]		 = UNITS_ADD,
         [midgard_alu_op_ilsr]		 = UNITS_ADD,
         [midgard_alu_op_ilsr]		 = UNITS_ADD,
 
-        [midgard_alu_op_fball_eq]	 = UNITS_ALL,
-        [midgard_alu_op_fbany_neq]	 = UNITS_ALL,
-        [midgard_alu_op_iball_eq]	 = UNITS_ALL,
-        [midgard_alu_op_ibany_neq]	 = UNITS_ALL
+        [midgard_alu_op_fball_eq]	 = UNITS_MOST,
+        [midgard_alu_op_fbany_neq]	 = UNITS_MOST,
+        [midgard_alu_op_iball_eq]	 = UNITS_MOST,
+        [midgard_alu_op_ibany_neq]	 = UNITS_MOST
 };
diff --git a/src/gallium/drivers/panfrost/midgard/midgard_compile.c b/src/gallium/drivers/panfrost/midgard/midgard_compile.c
index 07e3513..1e390d2 100644
--- a/src/gallium/drivers/panfrost/midgard/midgard_compile.c
+++ b/src/gallium/drivers/panfrost/midgard/midgard_compile.c
@@ -888,18 +888,8 @@ emit_condition(compiler_context *ctx, nir_src *src, bool for_branch)
         emit_mir_instruction(ctx, ins);
 }
 
-/* Components: Number/style of arguments:
- * 	3: One-argument op with r24 (i2f, f2i)
- * 	2: Standard two argument op (fadd, fmul)
- * 	1: Flipped one-argument op (fmov, imov)
- * 	0: Standard one-argument op (frcp)
- * NIR: NIR instruction op.
- * Op: Midgard instruction op.
- */
-
-#define ALU_CASE(_components, nir, _op) \
+#define ALU_CASE(nir, _op) \
 	case nir_op_##nir: \
-		components = _components; \
 		op = midgard_alu_op_##_op; \
 		break;
 
@@ -910,6 +900,7 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr)
 
         unsigned dest = nir_dest_index(&instr->dest.dest);
         unsigned nr_components = is_ssa ? instr->dest.dest.ssa.num_components : instr->dest.dest.reg.reg->num_components;
+        unsigned nr_inputs = nir_op_infos[instr->op].num_inputs;
 
         /* Most Midgard ALU ops have a 1:1 correspondance to NIR ops; these are
          * supported. A few do not and are commented for now. Also, there are a
@@ -918,70 +909,66 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr)
          * convention of the Midgard instruction; actual packing is done in
          * emit_alu below */
 
-        unsigned op, components;
+        unsigned op;
 
         switch (instr->op) {
-                ALU_CASE(2, fadd, fadd);
-                ALU_CASE(2, fmul, fmul);
-                ALU_CASE(2, fmin, fmin);
-                ALU_CASE(2, fmax, fmax);
-                ALU_CASE(2, imin, imin);
-                ALU_CASE(2, imax, imax);
-                ALU_CASE(1, fmov, fmov);
-                ALU_CASE(0, ffloor, ffloor);
-                ALU_CASE(0, fceil, fceil);
-                ALU_CASE(2, fdot3, fdot3);
-                //ALU_CASE(2, fdot3r);
-                ALU_CASE(2, fdot4, fdot4);
-                //ALU_CASE(2, freduce);
-                ALU_CASE(2, iadd, iadd);
-                ALU_CASE(2, isub, isub);
-                ALU_CASE(2, imul, imul);
+                ALU_CASE(fadd, fadd);
+                ALU_CASE(fmul, fmul);
+                ALU_CASE(fmin, fmin);
+                ALU_CASE(fmax, fmax);
+                ALU_CASE(imin, imin);
+                ALU_CASE(imax, imax);
+                ALU_CASE(fmov, fmov);
+                ALU_CASE(ffloor, ffloor);
+                ALU_CASE(fceil, fceil);
+                ALU_CASE(fdot3, fdot3);
+                ALU_CASE(fdot4, fdot4);
+                ALU_CASE(iadd, iadd);
+                ALU_CASE(isub, isub);
+                ALU_CASE(imul, imul);
 
                 /* XXX: Use fmov, not imov, since imov was causing major
                  * issues with texture precision? XXX research */
-                ALU_CASE(1, imov, fmov);
-
-                ALU_CASE(2, feq, feq);
-                ALU_CASE(2, fne, fne);
-                ALU_CASE(2, flt, flt);
-                ALU_CASE(2, ieq, ieq);
-                ALU_CASE(2, ine, ine);
-                ALU_CASE(2, ilt, ilt);
-                //ALU_CASE(2, icsel, icsel);
-                ALU_CASE(0, frcp, frcp);
-                ALU_CASE(0, frsq, frsqrt);
-                ALU_CASE(0, fsqrt, fsqrt);
-                ALU_CASE(0, fexp2, fexp2);
-                ALU_CASE(0, flog2, flog2);
-
-                ALU_CASE(3, f2i32, f2i);
-                ALU_CASE(3, f2u32, f2u);
-                ALU_CASE(3, i2f32, i2f);
-                ALU_CASE(3, u2f32, u2f);
-
-                ALU_CASE(0, fsin, fsin);
-                ALU_CASE(0, fcos, fcos);
-
-                ALU_CASE(2, iand, iand);
-                ALU_CASE(2, ior, ior);
-                ALU_CASE(2, ixor, ixor);
-                ALU_CASE(0, inot, inot);
-                ALU_CASE(2, ishl, ishl);
-                ALU_CASE(2, ishr, iasr);
-                ALU_CASE(2, ushr, ilsr);
-                //ALU_CASE(2, ilsr, ilsr);
-
-                ALU_CASE(2, ball_fequal4, fball_eq);
-                ALU_CASE(2, bany_fnequal4, fbany_neq);
-                ALU_CASE(2, ball_iequal4, iball_eq);
-                ALU_CASE(2, bany_inequal4, ibany_neq);
+                ALU_CASE(imov, fmov);
+
+                ALU_CASE(feq, feq);
+                ALU_CASE(fne, fne);
+                ALU_CASE(flt, flt);
+                ALU_CASE(ieq, ieq);
+                ALU_CASE(ine, ine);
+                ALU_CASE(ilt, ilt);
+
+                ALU_CASE(frcp, frcp);
+                ALU_CASE(frsq, frsqrt);
+                ALU_CASE(fsqrt, fsqrt);
+                ALU_CASE(fexp2, fexp2);
+                ALU_CASE(flog2, flog2);
+
+                ALU_CASE(f2i32, f2i);
+                ALU_CASE(f2u32, f2u);
+                ALU_CASE(i2f32, i2f);
+                ALU_CASE(u2f32, u2f);
+
+                ALU_CASE(fsin, fsin);
+                ALU_CASE(fcos, fcos);
+
+                ALU_CASE(iand, iand);
+                ALU_CASE(ior, ior);
+                ALU_CASE(ixor, ixor);
+                ALU_CASE(inot, inot);
+                ALU_CASE(ishl, ishl);
+                ALU_CASE(ishr, iasr);
+                ALU_CASE(ushr, ilsr);
+
+                ALU_CASE(ball_fequal4, fball_eq);
+                ALU_CASE(bany_fnequal4, fbany_neq);
+                ALU_CASE(ball_iequal4, iball_eq);
+                ALU_CASE(bany_inequal4, ibany_neq);
 
         /* For greater-or-equal, we use less-or-equal and flip the
          * arguments */
 
         case nir_op_ige: {
-                components = 2;
                 op = midgard_alu_op_ile;
 
                 /* Swap via temporary */
@@ -993,9 +980,11 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr)
         }
 
         case nir_op_bcsel: {
-                components = 2;
                 op = midgard_alu_op_fcsel;
 
+                /* csel works as a two-arg in Midgard, since the condition is hardcoded in r31.w */
+                nr_inputs = 2;
+
                 emit_condition(ctx, &instr->src[0].src, false);
 
                 /* The condition is the first argument; move the other
@@ -1016,7 +1005,6 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr)
 
         case nir_op_b2f32: {
                 op = midgard_alu_op_iand;
-                components = 0;
                 break;
         }
 
@@ -1026,7 +1014,9 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr)
                 return;
         }
 
-        int _unit = alu_opcode_props[op];
+        /* Fetch unit, quirks, etc information */
+        unsigned opcode_props = alu_opcode_props[op];
+        bool quirk_flipped_r24 = opcode_props & QUIRK_FLIPPED_R24;
 
         /* Initialise fields common between scalar/vector instructions */
         midgard_outmod outmod = instr->dest.saturate ? midgard_outmod_sat : midgard_outmod_none;
@@ -1036,7 +1026,7 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr)
          * needs it, or else we may segfault. */
 
         unsigned src0 = nir_alu_src_index(&instr->src[0]);
-        unsigned src1 = components == 2 ? nir_alu_src_index(&instr->src[1]) : SSA_UNUSED_0;
+        unsigned src1 = nr_inputs == 2 ? nir_alu_src_index(&instr->src[1]) : SSA_UNUSED_0;
 
         /* Rather than use the instruction generation helpers, we do it
          * ourselves here to avoid the mess */
@@ -1044,23 +1034,22 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr)
         midgard_instruction ins = {
                 .type = TAG_ALU_4,
                 .ssa_args = {
-                        .src0 = components == 3 || components == 2 || components == 0 ? src0 : SSA_UNUSED_1,
-                        .src1 = components == 2 ? src1 : components == 1 ? src0 : components == 0 ? SSA_UNUSED_0 : SSA_UNUSED_1,
+                        .src0 = quirk_flipped_r24 ? SSA_UNUSED_1 : src0,
+                        .src1 = quirk_flipped_r24 ? src0         : src1,
                         .dest = dest,
-                        .inline_constant = components == 0
+                        .inline_constant = (nr_inputs == 1) && !quirk_flipped_r24
                 }
         };
 
-        nir_alu_src *nirmod0 = NULL;
-        nir_alu_src *nirmod1 = NULL;
+        nir_alu_src *nirmods[2] = { NULL };
 
-        if (components == 2) {
-                nirmod0 = &instr->src[0];
-                nirmod1 = &instr->src[1];
-        } else if (components == 1) {
-                nirmod1 = &instr->src[0];
-        } else if (components == 0) {
-                nirmod0 = &instr->src[0];
+        if (nr_inputs == 2) {
+                nirmods[0] = &instr->src[0];
+                nirmods[1] = &instr->src[1];
+        } else if (nr_inputs == 1) {
+                nirmods[quirk_flipped_r24] = &instr->src[0];
+        } else {
+                assert(0);
         }
 
         midgard_vector_alu alu = {
@@ -1072,8 +1061,8 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr)
                 /* Writemask only valid for non-SSA NIR */
                 .mask = expand_writemask((1 << nr_components) - 1),
 
-                .src1 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmod0)),
-                .src2 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmod1)),
+                .src1 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmods[0])),
+                .src2 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmods[1])),
         };
 
         /* Apply writemask if non-SSA, keeping in mind that we can't write to components that don't exist */
@@ -1097,23 +1086,21 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr)
                 ins.constants[0] = 1.0;
         }
 
-        if (_unit == UNIT_VLUT) {
-                /* To avoid duplicating the LUTs (we think?), LUT instructions can only
-                 * operate as if they were scalars. Lower them here by changing the
-                 * component. */
-
-                assert(components == 0);
+        if ((opcode_props & UNITS_ALL) == UNIT_VLUT) {
+                /* To avoid duplicating the lookup tables (probably), true LUT
+                 * instructions can only operate as if they were scalars. Lower
+                 * them here by changing the component. */
 
                 uint8_t original_swizzle[4];
-                memcpy(original_swizzle, nirmod0->swizzle, sizeof(nirmod0->swizzle));
+                memcpy(original_swizzle, nirmods[0]->swizzle, sizeof(nirmods[0]->swizzle));
 
                 for (int i = 0; i < nr_components; ++i) {
                         ins.alu.mask = (0x3) << (2 * i); /* Mask the associated component */
 
                         for (int j = 0; j < 4; ++j)
-                                nirmod0->swizzle[j] = original_swizzle[i]; /* Pull from the correct component */
+                                nirmods[0]->swizzle[j] = original_swizzle[i]; /* Pull from the correct component */
 
-                        ins.alu.src1 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmod0));
+                        ins.alu.src1 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmods[0]));
                         emit_mir_instruction(ctx, ins);
                 }
         } else {
@@ -1121,6 +1108,8 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr)
         }
 }
 
+#undef ALU_CASE
+
 static void
 emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr)
 {
-- 
2.7.4