panfrost/midgard: Handle csel correctly
author Alyssa Rosenzweig <alyssa@rosenzweig.io>
Tue, 7 May 2019 02:52:08 +0000 (02:52 +0000)
committer Alyssa Rosenzweig <alyssa@rosenzweig.io>
Sun, 12 May 2019 22:21:49 +0000 (22:21 +0000)
We use an algebraic pass for the csel optimizations, and use proper
vectorized csel ops (i/fcsel_v) for mixed conditions, rather than
lowering them to scalar selects in NIR.
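
(Illustrative sketch, not driver code: a C emulation of the two condition
forms, assuming the 32-bit 0/~0 boolean convention this patch adopts; the
emu_* names are hypothetical. icsel reads one scalar condition from r31.w
and broadcasts it, while icsel_v reads a per-channel condition from all
of r31.)

    #include <stdint.h>

    /* icsel: a single scalar condition gates every output channel */
    static void
    emu_icsel(uint32_t cond_w, const uint32_t a[4], const uint32_t b[4],
              uint32_t out[4])
    {
            for (unsigned c = 0; c < 4; ++c)
                    out[c] = cond_w ? a[c] : b[c];
    }

    /* icsel_v: each output channel is gated by its own condition channel */
    static void
    emu_icsel_v(const uint32_t cond[4], const uint32_t a[4],
                const uint32_t b[4], uint32_t out[4])
    {
            for (unsigned c = 0; c < 4; ++c)
                    out[c] = cond[c] ? a[c] : b[c];
    }

    int main(void)
    {
            const uint32_t a[4] = {1, 2, 3, 4}, b[4] = {5, 6, 7, 8};
            const uint32_t cond[4] = {~0u, 0u, ~0u, 0u};
            uint32_t out[4];

            emu_icsel(~0u, a, b, out);    /* uniform condition: out = a */
            emu_icsel_v(cond, a, b, out); /* mixed: out = {1, 6, 3, 8} */
            return 0;
    }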

To avoid regressions along the way, we fix an issue with the copy
propagation pass (it should not attempt to propagate constants).
Similarly, we take care to break bundles when using csel to fix some
scheduler corner cases.
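
(A hedged sketch of the copy-prop hazard fixed here, mirroring the guard
added in midgard_opt_copy_prop below; the struct and helper names are
hypothetical. A "mov" that carries an inline or embedded constant is
really a constant load, so its nominal source index names no live value
and must not be forwarded to users.)

    #include <assert.h>
    #include <stdbool.h>

    struct ssa_args_model { int src0, src1, dest; bool inline_constant; };
    struct insn_model {
            struct ssa_args_model ssa_args;
            bool has_constants;
    };

    /* Only true register-to-register copies may be propagated */
    static bool
    mir_is_propagatable_copy(const struct insn_model *ins)
    {
            if (ins->ssa_args.inline_constant) return false;
            if (ins->has_constants) return false;
            return true;
    }

    int main(void)
    {
            struct insn_model mov = { .has_constants = true };
            assert(!mir_is_propagatable_copy(&mov)); /* constant load */
            return 0;
    }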

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
src/gallium/drivers/panfrost/ci/expected-failures.txt
src/gallium/drivers/panfrost/midgard/helpers.h
src/gallium/drivers/panfrost/midgard/midgard.h
src/gallium/drivers/panfrost/midgard/midgard_compile.c
src/gallium/drivers/panfrost/midgard/midgard_nir_algebraic.py

index f72aa07..0fe3342 100644 (file)
@@ -1872,9 +1872,6 @@ dEQP-GLES2.functional.shaders.random.all_features.fragment.74
 dEQP-GLES2.functional.shaders.random.all_features.fragment.77
 dEQP-GLES2.functional.shaders.random.all_features.fragment.95
 dEQP-GLES2.functional.shaders.random.all_features.vertex.17
-dEQP-GLES2.functional.shaders.random.conditionals.combined.88
-dEQP-GLES2.functional.shaders.random.conditionals.combined.92
-dEQP-GLES2.functional.shaders.random.conditionals.fragment.24
 dEQP-GLES2.functional.shaders.random.exponential.fragment.46
 dEQP-GLES2.functional.shaders.random.exponential.vertex.46
 dEQP-GLES2.functional.shaders.random.texture.fragment.1
index be2a45c..dc2de15 100644 (file)
@@ -196,7 +196,8 @@ static struct {
         [midgard_alu_op_ule]            = {"ule", UNITS_MOST},
 
         [midgard_alu_op_icsel]          = {"icsel", UNITS_ADD},
-        [midgard_alu_op_fcsel_i]        = {"fcsel_i", UNITS_ADD},
+        [midgard_alu_op_icsel_v]        = {"icsel_v", UNITS_ADD},
+        [midgard_alu_op_fcsel_v]        = {"fcsel_v", UNITS_ADD},
         [midgard_alu_op_fcsel]          = {"fcsel", UNITS_ADD | UNIT_SMUL},
 
         [midgard_alu_op_frcp]           = {"frcp", UNIT_VLUT},
index c280893..91d1c07 100644 (file)
@@ -138,8 +138,9 @@ typedef enum {
         midgard_alu_op_i2f        = 0xB8,
         midgard_alu_op_u2f        = 0xBC,
 
-        midgard_alu_op_icsel      = 0xC1,
-        midgard_alu_op_fcsel_i    = 0xC4,
+        midgard_alu_op_icsel_v    = 0xC0, /* condition code r31 */
+        midgard_alu_op_icsel      = 0xC1, /* condition code r31.w */
+        midgard_alu_op_fcsel_v    = 0xC4,
         midgard_alu_op_fcsel      = 0xC5,
         midgard_alu_op_fround     = 0xC6,
 
index 25316ca..4a26ba7 100644 (file)
@@ -138,6 +138,12 @@ typedef struct midgard_instruction {
         /* I.e. (1 << alu_bit) */
         int unit;
 
+        /* When emitting bundle, should this instruction have a break forced
+         * before it? Used for r31 writes which are valid only within a single
+         * bundle and *need* to happen as early as possible... this is a hack,
+         * TODO remove when we have a scheduler */
+        bool precede_break;
+
         bool has_constants;
         float constants[4];
         uint16_t inline_constant;
@@ -287,21 +293,6 @@ vector_alu_modifiers(nir_alu_src *src, bool is_int)
         return alu_src;
 }
 
-static bool
-mir_nontrivial_mod(midgard_vector_alu_src src, bool is_int, unsigned mask)
-{
-        /* abs or neg */
-        if (!is_int && src.mod) return true;
-
-        /* swizzle */
-        for (unsigned c = 0; c < 4; ++c) {
-                if (!(mask & (1 << c))) continue;
-                if (((src.swizzle >> (2*c)) & 3) != c) return true;
-        }
-
-        return false;
-}
-
 /* 'Intrinsic' move for misc aliasing uses independent of actual NIR ALU code */
 
 static midgard_instruction
@@ -715,58 +706,6 @@ midgard_nir_lower_fdot2_body(nir_builder *b, nir_alu_instr *alu)
         nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, nir_src_for_ssa(sum));
 }
 
-/* Lower csel with mixed condition channels to mulitple csel instructions. For
- * context, the csel ops on Midgard are vector in *outputs*, but not in
- * *conditions*. So, if the condition is e.g. yyyy, a single op can select a
- * vec4. But if the condition is e.g. xyzw, four ops are needed as the ISA
- * can't cope with the divergent channels.*/
-
-static void
-midgard_nir_lower_mixed_csel_body(nir_builder *b, nir_alu_instr *alu)
-{
-        if (alu->op != nir_op_bcsel)
-                return;
-
-        b->cursor = nir_before_instr(&alu->instr);
-
-        /* Must be run before registering */
-        assert(alu->dest.dest.is_ssa);
-
-        /* Check for mixed condition */
-
-        unsigned comp = alu->src[0].swizzle[0];
-        unsigned nr_components = alu->dest.dest.ssa.num_components;
-
-        bool mixed = false;
-
-        for (unsigned c = 1; c < nr_components; ++c)
-                mixed |= (alu->src[0].swizzle[c] != comp);
-
-        if (!mixed)
-                return;
-
-        /* We're mixed, so lower */
-
-        assert(nr_components <= 4);
-        nir_ssa_def *results[4];
-
-        nir_ssa_def *cond = nir_ssa_for_alu_src(b, alu, 0);
-        nir_ssa_def *choice0 = nir_ssa_for_alu_src(b, alu, 1);
-        nir_ssa_def *choice1 = nir_ssa_for_alu_src(b, alu, 2);
-
-        for (unsigned c = 0; c < nr_components; ++c) {
-                results[c] = nir_bcsel(b,
-                                nir_channel(b, cond, c),
-                                nir_channel(b, choice0, c),
-                                nir_channel(b, choice1, c));
-        }
-
-        /* Replace with our scalarized version */
-
-        nir_ssa_def *result = nir_vec(b, results, nr_components);
-        nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, nir_src_for_ssa(result));
-}
-
 static int
 midgard_nir_sysval_for_intrinsic(nir_intrinsic_instr *instr)
 {
@@ -851,36 +790,6 @@ midgard_nir_lower_fdot2(nir_shader *shader)
         return progress;
 }
 
-static bool
-midgard_nir_lower_mixed_csel(nir_shader *shader)
-{
-        bool progress = false;
-
-        nir_foreach_function(function, shader) {
-                if (!function->impl) continue;
-
-                nir_builder _b;
-                nir_builder *b = &_b;
-                nir_builder_init(b, function->impl);
-
-                nir_foreach_block(block, function->impl) {
-                        nir_foreach_instr_safe(instr, block) {
-                                if (instr->type != nir_instr_type_alu) continue;
-
-                                nir_alu_instr *alu = nir_instr_as_alu(instr);
-                                midgard_nir_lower_mixed_csel_body(b, alu);
-
-                                progress |= true;
-                        }
-                }
-
-                nir_metadata_preserve(function->impl, nir_metadata_block_index | nir_metadata_dominance);
-
-        }
-
-        return progress;
-}
-
 static void
 optimise_nir(nir_shader *nir)
 {
@@ -892,7 +801,6 @@ optimise_nir(nir_shader *nir)
 
         NIR_PASS(progress, nir, nir_lower_regs_to_ssa);
         NIR_PASS(progress, nir, midgard_nir_lower_fdot2);
-        NIR_PASS(progress, nir, midgard_nir_lower_mixed_csel);
 
         nir_lower_tex_options lower_tex_options = {
                 .lower_rect = true
@@ -957,6 +865,11 @@ optimise_nir(nir_shader *nir)
         } while (progress);
 
         NIR_PASS(progress, nir, nir_opt_algebraic_late);
+
+        /* We implement booleans as 32-bit 0/~0 */
+        NIR_PASS(progress, nir, nir_lower_bool_to_int32);
+
+        /* Now that booleans are lowered, we can run our late opts */
         NIR_PASS(progress, nir, midgard_nir_lower_algebraic_late);
 
         /* Lower mods for float ops only. Integer ops don't support modifiers
@@ -967,9 +880,6 @@ optimise_nir(nir_shader *nir)
         NIR_PASS(progress, nir, nir_copy_prop);
         NIR_PASS(progress, nir, nir_opt_dce);
 
-        /* We implement booleans as 32-bit 0/~0 */
-        NIR_PASS(progress, nir, nir_lower_bool_to_int32);
-
         /* Take us out of SSA */
         NIR_PASS(progress, nir, nir_lower_locals_to_regs);
         NIR_PASS(progress, nir, nir_convert_from_ssa, true);
@@ -1119,8 +1029,21 @@ nir_alu_src_index(compiler_context *ctx, nir_alu_src *src)
         return nir_src_index(ctx, &src->src);
 }
 
-/* Midgard puts conditionals in r31.w; move an arbitrary source (the output of
- * a conditional test) into that register */
+static bool
+nir_is_non_scalar_swizzle(nir_alu_src *src, unsigned nr_components)
+{
+        unsigned comp = src->swizzle[0];
+
+        for (unsigned c = 1; c < nr_components; ++c) {
+                if (src->swizzle[c] != comp)
+                        return true;
+        }
+
+        return false;
+}
+
+/* Midgard puts scalar conditionals in r31.w; move an arbitrary source (the
+ * output of a conditional test) into that register */
 
 static void
 emit_condition(compiler_context *ctx, nir_src *src, bool for_branch, unsigned component)
@@ -1138,8 +1061,13 @@ emit_condition(compiler_context *ctx, nir_src *src, bool for_branch, unsigned co
 
         midgard_instruction ins = {
                 .type = TAG_ALU_4,
-                .unit = for_branch ? UNIT_SMUL : UNIT_SADD, /* TODO: DEDUCE THIS */
+
+                /* We need to set the conditional as close as possible */
+                .precede_break = true,
+                .unit = for_branch ? UNIT_SMUL : UNIT_SADD,
+
                 .ssa_args = {
+
                         .src0 = condition,
                         .src1 = condition,
                         .dest = SSA_FIXED_REGISTER(31),
@@ -1157,6 +1085,46 @@ emit_condition(compiler_context *ctx, nir_src *src, bool for_branch, unsigned co
         emit_mir_instruction(ctx, ins);
 }
 
+/* Or, for mixed conditions (with csel_v), here's a vector version using all of
+ * r31 instead */
+
+static void
+emit_condition_mixed(compiler_context *ctx, nir_alu_src *src, unsigned nr_comp)
+{
+        int condition = nir_src_index(ctx, &src->src);
+
+        /* ALU source that applies the condition's own swizzle */
+
+        const midgard_vector_alu_src alu_src = {
+                .swizzle = SWIZZLE_FROM_ARRAY(src->swizzle),
+        };
+
+        /* There is no boolean move instruction. Instead, we simulate a move by
+         * ANDing the condition with itself to get it into r31.w */
+
+        midgard_instruction ins = {
+                .type = TAG_ALU_4,
+                .precede_break = true,
+                .ssa_args = {
+                        .src0 = condition,
+                        .src1 = condition,
+                        .dest = SSA_FIXED_REGISTER(31),
+                },
+                .alu = {
+                        .op = midgard_alu_op_iand,
+                        .reg_mode = midgard_reg_mode_32,
+                        .dest_override = midgard_dest_override_none,
+                        .mask = expand_writemask((1 << nr_comp) - 1),
+                        .src1 = vector_alu_srco_unsigned(alu_src),
+                        .src2 = vector_alu_srco_unsigned(alu_src)
+                },
+        };
+
+        emit_mir_instruction(ctx, ins);
+}
+
+
+
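(Sanity check for the iand-as-move trick above; illustrative C, not
driver code. x & x == x for any bit pattern, so ANDing the condition with
itself is effectively a move into r31.)

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            /* 0/~0 booleans: ANDing a value with itself leaves it intact */
            const uint32_t t = ~0u, f = 0u;
            assert((t & t) == t);
            assert((f & f) == f);
            return 0;
    }
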
 /* Likewise, indirect offsets are put in r27.w. TODO: Allow componentwise
  * pinning to eliminate this move in all known cases */
 
@@ -1189,7 +1157,6 @@ emit_indirect_offset(compiler_context *ctx, nir_src *src)
        case nir_op_##nir: \
                op = midgard_alu_op_##_op; \
                break;
-
 static bool
 nir_is_fzero_constant(nir_src src)
 {
@@ -1332,53 +1299,34 @@ emit_alu(compiler_context *ctx, nir_alu_instr *instr)
                 break;
         }
 
-        /* For a few special csel cases not handled by NIR, we can opt to
-         * bitwise. Otherwise, we emit the condition and do a real csel */
-
         case nir_op_b32csel: {
-                if (nir_is_fzero_constant(instr->src[2].src)) {
-                        /* (b ? v : 0) = (b & v) */
-                        op = midgard_alu_op_iand;
-                        nr_inputs = 2;
-                } else if (nir_is_fzero_constant(instr->src[1].src)) {
-                        /* (b ? 0 : v) = (!b ? v : 0) = (~b & v) = (v & ~b) */
-                        op = midgard_alu_op_iandnot;
-                        nr_inputs = 2;
-                        instr->src[1] = instr->src[0];
-                        instr->src[0] = instr->src[2];
-                } else {
-                        /* Midgard features both fcsel and icsel, depending on
-                         * the type of the arguments/output. However, as long
-                         * as we're careful we can _always_ use icsel and
-                         * _never_ need fcsel, since the latter does additional
-                         * floating-point-specific processing whereas the
-                         * former just moves bits on the wire. It's not obvious
-                         * why these are separate opcodes, save for the ability
-                         * to do things like sat/pos/abs/neg for free */
-
-                        op = midgard_alu_op_icsel;
+                /* Midgard features both fcsel and icsel, depending on
+                 * the type of the arguments/output. However, as long
+                 * as we're careful we can _always_ use icsel and
+                 * _never_ need fcsel, since the latter does additional
+                 * floating-point-specific processing whereas the
+                 * former just moves bits on the wire. It's not obvious
+                 * why these are separate opcodes, save for the ability
+                 * to do things like sat/pos/abs/neg for free */
 
-                        /* csel works as a two-arg in Midgard, since the condition is hardcoded in r31.w */
-                        nr_inputs = 2;
+                bool mixed = nir_is_non_scalar_swizzle(&instr->src[0], nr_components);
+                op = mixed ? midgard_alu_op_icsel_v : midgard_alu_op_icsel;
 
-                        /* Figure out which component the condition is in */
+                /* csel works as a two-arg in Midgard, since the condition is hardcoded in r31.w */
+                nr_inputs = 2;
 
-                        unsigned comp = instr->src[0].swizzle[0];
+                /* Emit the condition into r31 */
 
-                        /* Make sure NIR isn't throwing a mixed condition at us */
-
-                        for (unsigned c = 1; c < nr_components; ++c)
-                                assert(instr->src[0].swizzle[c] == comp);
-
-                        /* Emit the condition into r31.w */
-                        emit_condition(ctx, &instr->src[0].src, false, comp);
+                if (mixed)
+                        emit_condition_mixed(ctx, &instr->src[0], nr_components);
+                else
+                        emit_condition(ctx, &instr->src[0].src, false, instr->src[0].swizzle[0]);
 
-                        /* The condition is the first argument; move the other
-                         * arguments up one to be a binary instruction for
-                         * Midgard */
+                /* The condition is the first argument; move the other
+                 * arguments up one to be a binary instruction for
+                 * Midgard */
 
-                        memmove(instr->src, instr->src + 1, 2 * sizeof(nir_alu_src));
-                }
+                memmove(instr->src, instr->src + 1, 2 * sizeof(nir_alu_src));
                 break;
         }
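
(The comment above claims icsel can always stand in for fcsel because it
just moves bits on the wire; a hedged C check of that claim, not driver
code — bitwise_fsel is a hypothetical name.)

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    /* Select between two floats using only bitwise ops on a 0/~0 mask;
     * no float-specific processing occurs, so the result is bit-exact */
    static float
    bitwise_fsel(uint32_t mask, float x, float y)
    {
            uint32_t xi, yi, ri;
            memcpy(&xi, &x, sizeof(xi));
            memcpy(&yi, &y, sizeof(yi));
            ri = (mask & xi) | (~mask & yi);

            float r;
            memcpy(&r, &ri, sizeof(r));
            return r;
    }

    int main(void)
    {
            assert(bitwise_fsel(~0u, 1.5f, -2.0f) == 1.5f);
            assert(bitwise_fsel(0u, 1.5f, -2.0f) == -2.0f);
            return 0;
    }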
 
@@ -2596,6 +2544,10 @@ schedule_bundle(compiler_context *ctx, midgard_block *block, midgard_instruction
                         /* Ensure that the chain can continue */
                         if (ains->type != TAG_ALU_4) break;
 
+                        /* If there's already something in the bundle and we
+                         * have weird scheduler constraints, break now */
+                        if (ains->precede_break && index) break;
+
                         /* According to the presentation "The ARM
                          * Mali-T880 Mobile GPU" from HotChips 27,
                          * there are two pipeline stages. Branching
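
(A condensed model of the precede_break rule added above; the struct and
helper are hypothetical, and only the index check itself comes from the
patch. A flagged instruction may not join a bundle that already has
members, so its r31 write opens a fresh bundle.)

    #include <assert.h>
    #include <stdbool.h>
    #include <stddef.h>

    struct sched_insn { bool is_alu; bool precede_break; };

    /* May instruction `i` join a bundle already holding `index` members? */
    static bool
    can_join_bundle(const struct sched_insn *i, size_t index)
    {
            if (!i->is_alu)
                    return false;

            /* r31 writes are valid only within a single bundle and must
             * happen as early as possible, so break before a flagged insn */
            if (i->precede_break && index > 0)
                    return false;

            return true;
    }

    int main(void)
    {
            struct sched_insn cond = { .is_alu = true, .precede_break = true };
            assert(can_join_bundle(&cond, 0));  /* starts its own bundle */
            assert(!can_join_bundle(&cond, 1)); /* forces a break first */
            return 0;
    }
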
@@ -3297,6 +3249,21 @@ midgard_opt_dead_code_eliminate(compiler_context *ctx, midgard_block *block)
 }
 
 static bool
+mir_nontrivial_mod(midgard_vector_alu_src src, bool is_int, unsigned mask)
+{
+        /* abs or neg */
+        if (!is_int && src.mod) return true;
+
+        /* swizzle */
+        for (unsigned c = 0; c < 4; ++c) {
+                if (!(mask & (1 << c))) continue;
+                if (((src.swizzle >> (2*c)) & 3) != c) return true;
+        }
+
+        return false;
+}
+
+static bool
 midgard_opt_copy_prop(compiler_context *ctx, midgard_block *block)
 {
         bool progress = false;
@@ -3315,6 +3282,10 @@ midgard_opt_copy_prop(compiler_context *ctx, midgard_block *block)
                 if (to >= ctx->func->impl->ssa_alloc) continue;
                 if (from >= ctx->func->impl->ssa_alloc) continue;
 
+                /* Constant propagation is not handled here, either */
+                if (ins->ssa_args.inline_constant) continue;
+                if (ins->has_constants) continue;
+
                 /* Also, if the move has side effects, we're helpless */
 
                 midgard_vector_alu_src src =
index 7c8cd06..df0caa2 100644 (file)
@@ -28,6 +28,7 @@ import math
 
 a = 'a'
 b = 'b'
+c = 'c'
 
 algebraic_late = [
     # ineg must be lowered late, but only for integers; floats will try to
@@ -35,8 +36,13 @@ algebraic_late = [
     # a more standard lower_negate approach
 
     (('ineg', a), ('isub', 0, a)),
-]
 
+    # These two special cases save space and an op compared to a real csel,
+    # and give the scheduler more flexibility
+
+    (('b32csel', a, 'b@32', 0), ('iand', a, b)),
+    (('b32csel', a, 0, 'b@32'), ('iand', ('inot', a), b)),
+]
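
(Since booleans are 32-bit 0/~0 after nir_lower_bool_to_int32, both rules
can be sanity-checked directly; an illustrative C snippet, not part of
the patch.)

    #include <assert.h>
    #include <stdint.h>

    /* (a ? b : 0) == (a & b) and (a ? 0 : b) == (~a & b) for 0/~0 bools */
    int main(void)
    {
            const uint32_t v = 0xDEADBEEF;
            const uint32_t bools[2] = { 0u, ~0u };

            for (int i = 0; i < 2; ++i) {
                    uint32_t a = bools[i];
                    assert((a ? v : 0u) == (a & v));
                    assert((a ? 0u : v) == (~a & v));
            }

            return 0;
    }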
 
 # Midgard scales fsin/fcos arguments by pi.
 # Pass must be run only once, after the main loop