From 99a9a5a345fab8bbf36ab4e42581f8ee04a59a63 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Sat, 24 Oct 2015 17:35:03 -0700 Subject: [PATCH] vc4: Switch the unpack ops to being unpack flags on a mov. This paves the way for copy propagating our unpacks. We end up with a small change on shader-db: total instructions in shared programs: 89390 -> 89251 (-0.16%) instructions in affected programs: 19041 -> 18902 (-0.73%) which appears to be because we no longer convert MOVs for an FMAX dst, r4.unpack, r4.unpack (instead of the previous MOV dst, r4.unpack), and this ends up with a slightly better schedule. --- src/gallium/drivers/vc4/vc4_opt_algebraic.c | 1 + src/gallium/drivers/vc4/vc4_opt_small_immediates.c | 4 +- src/gallium/drivers/vc4/vc4_qir.c | 45 ++--------------- src/gallium/drivers/vc4/vc4_qir.h | 34 +++++-------- src/gallium/drivers/vc4/vc4_qpu_emit.c | 57 ++++------------------ src/gallium/drivers/vc4/vc4_register_allocate.c | 24 +++++---- 6 files changed, 42 insertions(+), 123 deletions(-) diff --git a/src/gallium/drivers/vc4/vc4_opt_algebraic.c b/src/gallium/drivers/vc4/vc4_opt_algebraic.c index 5b43583..f1bab81 100644 --- a/src/gallium/drivers/vc4/vc4_opt_algebraic.c +++ b/src/gallium/drivers/vc4/vc4_opt_algebraic.c @@ -64,6 +64,7 @@ is_constant_value(struct vc4_compile *c, struct qreg reg, uint32_t val) { if (reg.file == QFILE_UNIF && + !reg.pack && c->uniform_contents[reg.index] == QUNIFORM_CONSTANT && c->uniform_data[reg.index] == val) { return true; diff --git a/src/gallium/drivers/vc4/vc4_opt_small_immediates.c b/src/gallium/drivers/vc4/vc4_opt_small_immediates.c index d6e98f0..e615621 100644 --- a/src/gallium/drivers/vc4/vc4_opt_small_immediates.c +++ b/src/gallium/drivers/vc4/vc4_opt_small_immediates.c @@ -56,6 +56,7 @@ qir_opt_small_immediates(struct vc4_compile *c) struct qreg src = qir_follow_movs(c, inst->src[i]); if (src.file != QFILE_UNIF || + src.pack || c->uniform_contents[src.index] != QUNIFORM_CONSTANT) { continue; @@ -72,9 +73,6 @@ qir_opt_small_immediates(struct vc4_compile *c) continue; } - if (qir_src_needs_a_file(inst)) - continue; - uint32_t imm = c->uniform_data[src.index]; uint32_t small_imm = qpu_encode_small_immediate(imm); if (small_imm == ~0) diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c index ce6618f..bba4f6d 100644 --- a/src/gallium/drivers/vc4/vc4_qir.c +++ b/src/gallium/drivers/vc4/vc4_qir.c @@ -36,6 +36,7 @@ struct qir_op_info { static const struct qir_op_info qir_op_info[] = { [QOP_MOV] = { "mov", 1, 1 }, + [QOP_FMOV] = { "fmov", 1, 1 }, [QOP_FADD] = { "fadd", 1, 2 }, [QOP_FSUB] = { "fsub", 1, 2 }, [QOP_FMUL] = { "fmul", 1, 2 }, @@ -100,18 +101,6 @@ static const struct qir_op_info qir_op_info[] = { [QOP_TEX_B] = { "tex_b", 0, 2 }, [QOP_TEX_DIRECT] = { "tex_direct", 0, 2 }, [QOP_TEX_RESULT] = { "tex_result", 1, 0, true }, - [QOP_UNPACK_8A_F] = { "unpack_8a_f", 1, 1 }, - [QOP_UNPACK_8B_F] = { "unpack_8b_f", 1, 1 }, - [QOP_UNPACK_8C_F] = { "unpack_8c_f", 1, 1 }, - [QOP_UNPACK_8D_F] = { "unpack_8d_f", 1, 1 }, - [QOP_UNPACK_16A_F] = { "unpack_16a_f", 1, 1 }, - [QOP_UNPACK_16B_F] = { "unpack_16b_f", 1, 1 }, - [QOP_UNPACK_8A_I] = { "unpack_8a_i", 1, 1 }, - [QOP_UNPACK_8B_I] = { "unpack_8b_i", 1, 1 }, - [QOP_UNPACK_8C_I] = { "unpack_8c_i", 1, 1 }, - [QOP_UNPACK_8D_I] = { "unpack_8d_i", 1, 1 }, - [QOP_UNPACK_16A_I] = { "unpack_16a_i", 1, 1 }, - [QOP_UNPACK_16B_I] = { "unpack_16b_i", 1, 1 }, }; static const char * @@ -193,6 +182,7 @@ bool qir_is_float_input(struct qinst *inst) { switch (inst->op) { + case QOP_FMOV: case QOP_FMUL: case QOP_FADD: case QOP_FSUB: @@ -201,12 +191,6 @@ qir_is_float_input(struct qinst *inst) case QOP_FMINABS: case QOP_FMAXABS: case QOP_FTOI: - case QOP_UNPACK_8A_F: - case QOP_UNPACK_8B_F: - case QOP_UNPACK_8C_F: - case QOP_UNPACK_8D_F: - case QOP_UNPACK_16A_F: - case QOP_UNPACK_16B_F: return true; default: return false; @@ -216,7 +200,8 @@ qir_is_float_input(struct qinst *inst) bool qir_is_raw_mov(struct qinst *inst) { - return (inst->op == QOP_MOV && + return ((inst->op == QOP_MOV || + inst->op == QOP_FMOV) && !inst->dst.pack && !inst->src[0].pack); } @@ -246,28 +231,6 @@ qir_depends_on_flags(struct qinst *inst) } bool -qir_src_needs_a_file(struct qinst *inst) -{ - switch (inst->op) { - case QOP_UNPACK_8A_F: - case QOP_UNPACK_8B_F: - case QOP_UNPACK_8C_F: - case QOP_UNPACK_8D_F: - case QOP_UNPACK_16A_F: - case QOP_UNPACK_16B_F: - case QOP_UNPACK_8A_I: - case QOP_UNPACK_8B_I: - case QOP_UNPACK_8C_I: - case QOP_UNPACK_8D_I: - case QOP_UNPACK_16A_I: - case QOP_UNPACK_16B_I: - return true; - default: - return false; - } -} - -bool qir_writes_r4(struct qinst *inst) { switch (inst->op) { diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h index 1a1e0f3..393749b 100644 --- a/src/gallium/drivers/vc4/vc4_qir.h +++ b/src/gallium/drivers/vc4/vc4_qir.h @@ -37,6 +37,7 @@ #include "util/u_math.h" #include "vc4_screen.h" +#include "vc4_qpu_defines.h" #include "pipe/p_state.h" struct nir_builder; @@ -64,6 +65,7 @@ struct qreg { enum qop { QOP_UNDEF, QOP_MOV, + QOP_FMOV, QOP_FADD, QOP_FSUB, QOP_FMUL, @@ -128,20 +130,6 @@ enum qop { QOP_FRAG_W, QOP_FRAG_REV_FLAG, - QOP_UNPACK_8A_F, - QOP_UNPACK_8B_F, - QOP_UNPACK_8C_F, - QOP_UNPACK_8D_F, - QOP_UNPACK_16A_F, - QOP_UNPACK_16B_F, - - QOP_UNPACK_8A_I, - QOP_UNPACK_8B_I, - QOP_UNPACK_8C_I, - QOP_UNPACK_8D_I, - QOP_UNPACK_16A_I, - QOP_UNPACK_16B_I, - /** Texture x coordinate parameter write */ QOP_TEX_S, /** Texture y coordinate parameter write */ @@ -468,7 +456,6 @@ bool qir_is_tex(struct qinst *inst); bool qir_is_float_input(struct qinst *inst); bool qir_depends_on_flags(struct qinst *inst); bool qir_writes_r4(struct qinst *inst); -bool qir_src_needs_a_file(struct qinst *inst); struct qreg qir_follow_movs(struct vc4_compile *c, struct qreg reg); void qir_dump(struct vc4_compile *c); @@ -569,6 +556,7 @@ qir_##name(struct vc4_compile *c, struct qreg dest, struct qreg a) \ } QIR_ALU1(MOV) +QIR_ALU1(FMOV) QIR_ALU2(FADD) QIR_ALU2(FSUB) QIR_ALU2(FMUL) @@ -635,32 +623,32 @@ QIR_NODST_1(TLB_STENCIL_SETUP) static inline struct qreg qir_UNPACK_8_F(struct vc4_compile *c, struct qreg src, int i) { - struct qreg t = qir_get_temp(c); - qir_emit(c, qir_inst(QOP_UNPACK_8A_F + i, t, src, c->undef)); + struct qreg t = qir_FMOV(c, src); + c->defs[t.index]->src[0].pack = QPU_UNPACK_8A + i; return t; } static inline struct qreg qir_UNPACK_8_I(struct vc4_compile *c, struct qreg src, int i) { - struct qreg t = qir_get_temp(c); - qir_emit(c, qir_inst(QOP_UNPACK_8A_I + i, t, src, c->undef)); + struct qreg t = qir_MOV(c, src); + c->defs[t.index]->src[0].pack = QPU_UNPACK_8A + i; return t; } static inline struct qreg qir_UNPACK_16_F(struct vc4_compile *c, struct qreg src, int i) { - struct qreg t = qir_get_temp(c); - qir_emit(c, qir_inst(QOP_UNPACK_16A_F + i, t, src, c->undef)); + struct qreg t = qir_FMOV(c, src); + c->defs[t.index]->src[0].pack = QPU_UNPACK_16A + i; return t; } static inline struct qreg qir_UNPACK_16_I(struct vc4_compile *c, struct qreg src, int i) { - struct qreg t = qir_get_temp(c); - qir_emit(c, qir_inst(QOP_UNPACK_16A_I + i, t, src, c->undef)); + struct qreg t = qir_MOV(c, src); + c->defs[t.index]->src[0].pack = QPU_UNPACK_16A + i; return t; } diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c index 94fd187..a3eff84 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_emit.c +++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c @@ -134,15 +134,6 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) uint32_t vpm_read_fifo_count = 0; uint32_t vpm_read_offset = 0; int last_vpm_read_index = -1; - /* Map from the QIR ops enum order to QPU unpack bits. */ - static const uint32_t unpack_map[] = { - QPU_UNPACK_8A, - QPU_UNPACK_8B, - QPU_UNPACK_8C, - QPU_UNPACK_8D, - QPU_UNPACK_16A, - QPU_UNPACK_16B, - }; list_inithead(&c->qpu_inst_list); @@ -214,8 +205,10 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) * out the same as a MOV. */ [QOP_MOV] = { QPU_A_OR }, + [QOP_FMOV] = { QPU_A_FMAX }, }; + uint64_t unpack = 0; struct qpu_reg src[4]; for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) { int index = qinst->src[i].index; @@ -225,6 +218,14 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) break; case QFILE_TEMP: src[i] = temp_registers[index]; + if (qinst->src[i].pack) { + assert(!unpack || + unpack == qinst->src[i].pack); + unpack = QPU_SET_FIELD(qinst->src[i].pack, + QPU_UNPACK); + if (src[i].mux == QPU_MUX_R4) + unpack |= QPU_PM; + } break; case QFILE_UNIF: src[i] = qpu_unif(); @@ -426,44 +427,6 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) queue(c, qpu_a_MOV(dst, qpu_r4())); break; - case QOP_UNPACK_8A_F: - case QOP_UNPACK_8B_F: - case QOP_UNPACK_8C_F: - case QOP_UNPACK_8D_F: - case QOP_UNPACK_16A_F: - case QOP_UNPACK_16B_F: - if (src[0].mux == QPU_MUX_R4) { - queue(c, qpu_a_MOV(dst, src[0])); - *last_inst(c) |= QPU_PM; - *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A + - (qinst->op - - QOP_UNPACK_8A_F), - QPU_UNPACK); - } else { - assert(src[0].mux == QPU_MUX_A); - - queue(c, qpu_a_FMAX(dst, src[0], src[0])); - *last_inst(c) |= - QPU_SET_FIELD(unpack_map[qinst->op - - QOP_UNPACK_8A_F], - QPU_UNPACK); - } - break; - - case QOP_UNPACK_8A_I: - case QOP_UNPACK_8B_I: - case QOP_UNPACK_8C_I: - case QOP_UNPACK_8D_I: - case QOP_UNPACK_16A_I: - case QOP_UNPACK_16B_I: - assert(src[0].mux == QPU_MUX_A); - - queue(c, qpu_a_MOV(dst, src[0])); - *last_inst(c) |= QPU_SET_FIELD(unpack_map[qinst->op - - QOP_UNPACK_8A_I], - QPU_UNPACK); - break; - default: assert(qinst->op < ARRAY_SIZE(translate)); assert(translate[qinst->op].op != 0); /* NOPs */ diff --git a/src/gallium/drivers/vc4/vc4_register_allocate.c b/src/gallium/drivers/vc4/vc4_register_allocate.c index 2a1b6c35..bca36c3 100644 --- a/src/gallium/drivers/vc4/vc4_register_allocate.c +++ b/src/gallium/drivers/vc4/vc4_register_allocate.c @@ -282,17 +282,23 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c) class_bits[inst->dst.index] &= CLASS_BIT_A; } - if (qir_src_needs_a_file(inst)) { - if (qir_is_float_input(inst)) { - /* Special case: these can be done as R4 - * unpacks, as well. - */ - class_bits[inst->src[0].index] &= (CLASS_BIT_A | - CLASS_BIT_R4); - } else { - class_bits[inst->src[0].index] &= CLASS_BIT_A; + /* Apply restrictions for src unpacks. The integer unpacks + * can only be done from regfile A, while float unpacks can be + * either A or R4. + */ + for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + if (inst->src[i].file == QFILE_TEMP && + inst->src[i].pack) { + if (qir_is_float_input(inst)) { + class_bits[inst->src[i].index] &= + CLASS_BIT_A | CLASS_BIT_R4; + } else { + class_bits[inst->src[i].index] &= + CLASS_BIT_A; + } } } + ip++; } -- 2.7.4