From 99a9a5a345fab8bbf36ab4e42581f8ee04a59a63 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Sat, 24 Oct 2015 17:35:03 -0700
Subject: [PATCH] vc4: Switch the unpack ops to being unpack flags on a mov.

This paves the way for copy propagating our unpacks.  We end up with a
small change on shader-db:

total instructions in shared programs: 89390 -> 89251 (-0.16%)
instructions in affected programs:     19041 -> 18902 (-0.73%)

which appears to be because float unpacks from r4 are now emitted as
FMAX dst, r4.unpack, r4.unpack (instead of the previous MOV dst,
r4.unpack), so they are no longer subject to MOV conversion, and this
ends up with a slightly better schedule.
---
 src/gallium/drivers/vc4/vc4_opt_algebraic.c        |  1 +
 src/gallium/drivers/vc4/vc4_opt_small_immediates.c |  4 +-
 src/gallium/drivers/vc4/vc4_qir.c                  | 45 ++---------------
 src/gallium/drivers/vc4/vc4_qir.h                  | 34 +++++--------
 src/gallium/drivers/vc4/vc4_qpu_emit.c             | 57 ++++------------------
 src/gallium/drivers/vc4/vc4_register_allocate.c    | 24 +++++----
 6 files changed, 42 insertions(+), 123 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_opt_algebraic.c b/src/gallium/drivers/vc4/vc4_opt_algebraic.c
index 5b43583..f1bab81 100644
--- a/src/gallium/drivers/vc4/vc4_opt_algebraic.c
+++ b/src/gallium/drivers/vc4/vc4_opt_algebraic.c
@@ -64,6 +64,7 @@ is_constant_value(struct vc4_compile *c, struct qreg reg,
                   uint32_t val)
 {
         if (reg.file == QFILE_UNIF &&
+            !reg.pack &&
             c->uniform_contents[reg.index] == QUNIFORM_CONSTANT &&
             c->uniform_data[reg.index] == val) {
                 return true;
diff --git a/src/gallium/drivers/vc4/vc4_opt_small_immediates.c b/src/gallium/drivers/vc4/vc4_opt_small_immediates.c
index d6e98f0..e615621 100644
--- a/src/gallium/drivers/vc4/vc4_opt_small_immediates.c
+++ b/src/gallium/drivers/vc4/vc4_opt_small_immediates.c
@@ -56,6 +56,7 @@ qir_opt_small_immediates(struct vc4_compile *c)
                         struct qreg src = qir_follow_movs(c, inst->src[i]);
 
                         if (src.file != QFILE_UNIF ||
+                            src.pack ||
                             c->uniform_contents[src.index] !=
                             QUNIFORM_CONSTANT) {
                                 continue;
@@ -72,9 +73,6 @@ qir_opt_small_immediates(struct vc4_compile *c)
                                 continue;
                         }
 
-                        if (qir_src_needs_a_file(inst))
-                                continue;
-
                         uint32_t imm = c->uniform_data[src.index];
                         uint32_t small_imm = qpu_encode_small_immediate(imm);
                         if (small_imm == ~0)
diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c
index ce6618f..bba4f6d 100644
--- a/src/gallium/drivers/vc4/vc4_qir.c
+++ b/src/gallium/drivers/vc4/vc4_qir.c
@@ -36,6 +36,7 @@ struct qir_op_info {
 
 static const struct qir_op_info qir_op_info[] = {
         [QOP_MOV] = { "mov", 1, 1 },
+        [QOP_FMOV] = { "fmov", 1, 1 },
         [QOP_FADD] = { "fadd", 1, 2 },
         [QOP_FSUB] = { "fsub", 1, 2 },
         [QOP_FMUL] = { "fmul", 1, 2 },
@@ -100,18 +101,6 @@ static const struct qir_op_info qir_op_info[] = {
         [QOP_TEX_B] = { "tex_b", 0, 2 },
         [QOP_TEX_DIRECT] = { "tex_direct", 0, 2 },
         [QOP_TEX_RESULT] = { "tex_result", 1, 0, true },
-        [QOP_UNPACK_8A_F] = { "unpack_8a_f", 1, 1 },
-        [QOP_UNPACK_8B_F] = { "unpack_8b_f", 1, 1 },
-        [QOP_UNPACK_8C_F] = { "unpack_8c_f", 1, 1 },
-        [QOP_UNPACK_8D_F] = { "unpack_8d_f", 1, 1 },
-        [QOP_UNPACK_16A_F] = { "unpack_16a_f", 1, 1 },
-        [QOP_UNPACK_16B_F] = { "unpack_16b_f", 1, 1 },
-        [QOP_UNPACK_8A_I] = { "unpack_8a_i", 1, 1 },
-        [QOP_UNPACK_8B_I] = { "unpack_8b_i", 1, 1 },
-        [QOP_UNPACK_8C_I] = { "unpack_8c_i", 1, 1 },
-        [QOP_UNPACK_8D_I] = { "unpack_8d_i", 1, 1 },
-        [QOP_UNPACK_16A_I] = { "unpack_16a_i", 1, 1 },
-        [QOP_UNPACK_16B_I] = { "unpack_16b_i", 1, 1 },
 };
 
 static const char *
@@ -193,6 +182,7 @@ bool
 qir_is_float_input(struct qinst *inst)
 {
         switch (inst->op) {
+        case QOP_FMOV:
         case QOP_FMUL:
         case QOP_FADD:
         case QOP_FSUB:
@@ -201,12 +191,6 @@ qir_is_float_input(struct qinst *inst)
         case QOP_FMINABS:
         case QOP_FMAXABS:
         case QOP_FTOI:
-        case QOP_UNPACK_8A_F:
-        case QOP_UNPACK_8B_F:
-        case QOP_UNPACK_8C_F:
-        case QOP_UNPACK_8D_F:
-        case QOP_UNPACK_16A_F:
-        case QOP_UNPACK_16B_F:
                 return true;
         default:
                 return false;
@@ -216,7 +200,8 @@ qir_is_float_input(struct qinst *inst)
 bool
 qir_is_raw_mov(struct qinst *inst)
 {
-        return (inst->op == QOP_MOV &&
+        return ((inst->op == QOP_MOV ||
+                 inst->op == QOP_FMOV) &&
                 !inst->dst.pack &&
                 !inst->src[0].pack);
 }
@@ -246,28 +231,6 @@ qir_depends_on_flags(struct qinst *inst)
 }
 
 bool
-qir_src_needs_a_file(struct qinst *inst)
-{
-        switch (inst->op) {
-        case QOP_UNPACK_8A_F:
-        case QOP_UNPACK_8B_F:
-        case QOP_UNPACK_8C_F:
-        case QOP_UNPACK_8D_F:
-        case QOP_UNPACK_16A_F:
-        case QOP_UNPACK_16B_F:
-        case QOP_UNPACK_8A_I:
-        case QOP_UNPACK_8B_I:
-        case QOP_UNPACK_8C_I:
-        case QOP_UNPACK_8D_I:
-        case QOP_UNPACK_16A_I:
-        case QOP_UNPACK_16B_I:
-                return true;
-        default:
-                return false;
-        }
-}
-
-bool
 qir_writes_r4(struct qinst *inst)
 {
         switch (inst->op) {
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index 1a1e0f3..393749b 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -37,6 +37,7 @@
 #include "util/u_math.h"
 
 #include "vc4_screen.h"
+#include "vc4_qpu_defines.h"
 #include "pipe/p_state.h"
 
 struct nir_builder;
@@ -64,6 +65,7 @@ struct qreg {
 enum qop {
         QOP_UNDEF,
         QOP_MOV,
+        QOP_FMOV,
         QOP_FADD,
         QOP_FSUB,
         QOP_FMUL,
@@ -128,20 +130,6 @@ enum qop {
         QOP_FRAG_W,
         QOP_FRAG_REV_FLAG,
 
-        QOP_UNPACK_8A_F,
-        QOP_UNPACK_8B_F,
-        QOP_UNPACK_8C_F,
-        QOP_UNPACK_8D_F,
-        QOP_UNPACK_16A_F,
-        QOP_UNPACK_16B_F,
-
-        QOP_UNPACK_8A_I,
-        QOP_UNPACK_8B_I,
-        QOP_UNPACK_8C_I,
-        QOP_UNPACK_8D_I,
-        QOP_UNPACK_16A_I,
-        QOP_UNPACK_16B_I,
-
         /** Texture x coordinate parameter write */
         QOP_TEX_S,
         /** Texture y coordinate parameter write */
@@ -468,7 +456,6 @@ bool qir_is_tex(struct qinst *inst);
 bool qir_is_float_input(struct qinst *inst);
 bool qir_depends_on_flags(struct qinst *inst);
 bool qir_writes_r4(struct qinst *inst);
-bool qir_src_needs_a_file(struct qinst *inst);
 struct qreg qir_follow_movs(struct vc4_compile *c, struct qreg reg);
 
 void qir_dump(struct vc4_compile *c);
@@ -569,6 +556,7 @@ qir_##name(struct vc4_compile *c, struct qreg dest, struct qreg a)       \
 }
 
 QIR_ALU1(MOV)
+QIR_ALU1(FMOV)
 QIR_ALU2(FADD)
 QIR_ALU2(FSUB)
 QIR_ALU2(FMUL)
@@ -635,32 +623,32 @@ QIR_NODST_1(TLB_STENCIL_SETUP)
 static inline struct qreg
 qir_UNPACK_8_F(struct vc4_compile *c, struct qreg src, int i)
 {
-        struct qreg t = qir_get_temp(c);
-        qir_emit(c, qir_inst(QOP_UNPACK_8A_F + i, t, src, c->undef));
+        struct qreg t = qir_FMOV(c, src);
+        c->defs[t.index]->src[0].pack = QPU_UNPACK_8A + i;
         return t;
 }
 
 static inline struct qreg
 qir_UNPACK_8_I(struct vc4_compile *c, struct qreg src, int i)
 {
-        struct qreg t = qir_get_temp(c);
-        qir_emit(c, qir_inst(QOP_UNPACK_8A_I + i, t, src, c->undef));
+        struct qreg t = qir_MOV(c, src);
+        c->defs[t.index]->src[0].pack = QPU_UNPACK_8A + i;
         return t;
 }
 
 static inline struct qreg
 qir_UNPACK_16_F(struct vc4_compile *c, struct qreg src, int i)
 {
-        struct qreg t = qir_get_temp(c);
-        qir_emit(c, qir_inst(QOP_UNPACK_16A_F + i, t, src, c->undef));
+        struct qreg t = qir_FMOV(c, src);
+        c->defs[t.index]->src[0].pack = QPU_UNPACK_16A + i;
         return t;
 }
 
 static inline struct qreg
 qir_UNPACK_16_I(struct vc4_compile *c, struct qreg src, int i)
 {
-        struct qreg t = qir_get_temp(c);
-        qir_emit(c, qir_inst(QOP_UNPACK_16A_I + i, t, src, c->undef));
+        struct qreg t = qir_MOV(c, src);
+        c->defs[t.index]->src[0].pack = QPU_UNPACK_16A + i;
         return t;
 }
 
diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c
index 94fd187..a3eff84 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_emit.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c
@@ -134,15 +134,6 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
         uint32_t vpm_read_fifo_count = 0;
         uint32_t vpm_read_offset = 0;
         int last_vpm_read_index = -1;
-        /* Map from the QIR ops enum order to QPU unpack bits. */
-        static const uint32_t unpack_map[] = {
-                QPU_UNPACK_8A,
-                QPU_UNPACK_8B,
-                QPU_UNPACK_8C,
-                QPU_UNPACK_8D,
-                QPU_UNPACK_16A,
-                QPU_UNPACK_16B,
-        };
 
         list_inithead(&c->qpu_inst_list);
 
@@ -214,8 +205,10 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                          * out the same as a MOV.
                          */
                         [QOP_MOV] = { QPU_A_OR },
+                        [QOP_FMOV] = { QPU_A_FMAX },
                 };
 
+                uint64_t unpack = 0;
                 struct qpu_reg src[4];
                 for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                         int index = qinst->src[i].index;
@@ -225,6 +218,14 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                                 break;
                         case QFILE_TEMP:
                                 src[i] = temp_registers[index];
+                                if (qinst->src[i].pack) {
+                                        assert(!unpack ||
+                                               unpack == qinst->src[i].pack);
+                                        unpack = QPU_SET_FIELD(qinst->src[i].pack,
+                                                               QPU_UNPACK);
+                                        if (src[i].mux == QPU_MUX_R4)
+                                                unpack |= QPU_PM;
+                                }
                                 break;
                         case QFILE_UNIF:
                                 src[i] = qpu_unif();
@@ -426,44 +427,6 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                                 queue(c, qpu_a_MOV(dst, qpu_r4()));
                         break;
 
-                case QOP_UNPACK_8A_F:
-                case QOP_UNPACK_8B_F:
-                case QOP_UNPACK_8C_F:
-                case QOP_UNPACK_8D_F:
-                case QOP_UNPACK_16A_F:
-                case QOP_UNPACK_16B_F:
-                        if (src[0].mux == QPU_MUX_R4) {
-                                queue(c, qpu_a_MOV(dst, src[0]));
-                                *last_inst(c) |= QPU_PM;
-                                *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
-                                                               (qinst->op -
-                                                                QOP_UNPACK_8A_F),
-                                                               QPU_UNPACK);
-                        } else {
-                                assert(src[0].mux == QPU_MUX_A);
-
-                                queue(c, qpu_a_FMAX(dst, src[0], src[0]));
-                                *last_inst(c) |=
-                                        QPU_SET_FIELD(unpack_map[qinst->op -
-                                                                 QOP_UNPACK_8A_F],
-                                                      QPU_UNPACK);
-                        }
-                        break;
-
-                case QOP_UNPACK_8A_I:
-                case QOP_UNPACK_8B_I:
-                case QOP_UNPACK_8C_I:
-                case QOP_UNPACK_8D_I:
-                case QOP_UNPACK_16A_I:
-                case QOP_UNPACK_16B_I:
-                        assert(src[0].mux == QPU_MUX_A);
-
-                        queue(c, qpu_a_MOV(dst, src[0]));
-                        *last_inst(c) |= QPU_SET_FIELD(unpack_map[qinst->op -
-                                                                  QOP_UNPACK_8A_I],
-                                                       QPU_UNPACK);
-                        break;
-
                 default:
                         assert(qinst->op < ARRAY_SIZE(translate));
                         assert(translate[qinst->op].op != 0); /* NOPs */
diff --git a/src/gallium/drivers/vc4/vc4_register_allocate.c b/src/gallium/drivers/vc4/vc4_register_allocate.c
index 2a1b6c35..bca36c3 100644
--- a/src/gallium/drivers/vc4/vc4_register_allocate.c
+++ b/src/gallium/drivers/vc4/vc4_register_allocate.c
@@ -282,17 +282,23 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
                         class_bits[inst->dst.index] &= CLASS_BIT_A;
                 }
 
-                if (qir_src_needs_a_file(inst)) {
-                        if (qir_is_float_input(inst)) {
-                                /* Special case: these can be done as R4
-                                 * unpacks, as well.
-                                 */
-                                class_bits[inst->src[0].index] &= (CLASS_BIT_A |
-                                                                   CLASS_BIT_R4);
-                        } else {
-                                class_bits[inst->src[0].index] &= CLASS_BIT_A;
+                /* Apply restrictions for src unpacks.  The integer unpacks
+                 * can only be done from regfile A, while float unpacks can be
+                 * either A or R4.
+                 */
+                for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+                        if (inst->src[i].file == QFILE_TEMP &&
+                            inst->src[i].pack) {
+                                if (qir_is_float_input(inst)) {
+                                        class_bits[inst->src[i].index] &=
+                                                CLASS_BIT_A | CLASS_BIT_R4;
+                                } else {
+                                        class_bits[inst->src[i].index] &=
+                                                CLASS_BIT_A;
+                                }
                         }
                 }
+
                 ip++;
         }
 
-- 
2.7.4