From d4c20e82ae34b105fb2d06c8c412656aba2ca1b9 Mon Sep 17 00:00:00 2001
From: Eric Anholt
Date: Tue, 15 Nov 2016 12:54:26 -0800
Subject: [PATCH] vc4: Restructure texture insts as ALU ops with tex_[strb] as the dst.

For now we're still just generating MOVs, but this will let us fold into
other ops in the future.

No difference on shader-db.
---
 src/gallium/drivers/vc4/vc4_opt_algebraic.c        | 15 +++--
 src/gallium/drivers/vc4/vc4_opt_small_immediates.c |  7 +--
 src/gallium/drivers/vc4/vc4_program.c              | 34 +++++++---
 src/gallium/drivers/vc4/vc4_qir.c                  | 72 +++++++++++++++++++---
 src/gallium/drivers/vc4/vc4_qir.h                  | 40 +++++-------
 src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c   |  2 +-
 src/gallium/drivers/vc4/vc4_qir_schedule.c         | 50 ++++++++-------
 src/gallium/drivers/vc4/vc4_qir_validate.c         | 24 ++++++++
 src/gallium/drivers/vc4/vc4_qpu_emit.c             | 39 +++++++-----
 9 files changed, 194 insertions(+), 89 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_opt_algebraic.c b/src/gallium/drivers/vc4/vc4_opt_algebraic.c
index 01ad05d..5e7d269 100644
--- a/src/gallium/drivers/vc4/vc4_opt_algebraic.c
+++ b/src/gallium/drivers/vc4/vc4_opt_algebraic.c
@@ -94,14 +94,17 @@ static void
 replace_with_mov(struct vc4_compile *c, struct qinst *inst, struct qreg arg)
 {
         dump_from(c, inst);
+
+        inst->src[0] = arg;
+        if (qir_has_implicit_tex_uniform(inst))
+                inst->src[1] = inst->src[qir_get_tex_uniform_src(inst)];
+
         if (qir_is_mul(inst))
                 inst->op = QOP_MMOV;
         else if (qir_is_float_input(inst))
                 inst->op = QOP_FMOV;
         else
                 inst->op = QOP_MOV;
-        inst->src[0] = arg;
-        inst->src[1] = c->undef;
         dump_to(c, inst);
 }
 
@@ -172,8 +175,12 @@ qir_opt_algebraic(struct vc4_compile *c)
                         break;
 
                 case QOP_ADD:
-                        if (replace_x_0_with_x(c, inst, 0) ||
-                            replace_x_0_with_x(c, inst, 1)) {
+                        /* Kernel validation requires that we use an actual
+                         * add instruction.
+                         */
+                        if (inst->dst.file != QFILE_TEX_S_DIRECT &&
+                            (replace_x_0_with_x(c, inst, 0) ||
+                             replace_x_0_with_x(c, inst, 1))) {
                                 progress = true;
                                 break;
                         }
diff --git a/src/gallium/drivers/vc4/vc4_opt_small_immediates.c b/src/gallium/drivers/vc4/vc4_opt_small_immediates.c
index 15cbd12..89c4857 100644
--- a/src/gallium/drivers/vc4/vc4_opt_small_immediates.c
+++ b/src/gallium/drivers/vc4/vc4_opt_small_immediates.c
@@ -62,11 +62,8 @@ qir_opt_small_immediates(struct vc4_compile *c)
                                 continue;
                         }
 
-                        if (i == 1 &&
-                            (inst->op == QOP_TEX_S ||
-                             inst->op == QOP_TEX_T ||
-                             inst->op == QOP_TEX_R ||
-                             inst->op == QOP_TEX_B)) {
+                        if (qir_is_tex(inst) &&
+                            i == qir_get_tex_uniform_src(inst)) {
                                 /* No turning the implicit uniform read into
                                  * an immediate.
                                  */
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index 97cbabb..66fd902 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -120,7 +120,10 @@ indirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr)
                                   qir_uniform_ui(c, (range->dst_offset +
                                                      range->size - 4)));
 
-        qir_TEX_DIRECT(c, indirect_offset, qir_uniform(c, QUNIFORM_UBO_ADDR, 0));
+        qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0),
+                     indirect_offset,
+                     qir_uniform(c, QUNIFORM_UBO_ADDR, 0));
+
         c->num_texture_samples++;
 
         ntq_emit_thrsw(c);
@@ -381,7 +384,8 @@ ntq_emit_txf(struct vc4_compile *c, nir_tex_instr *instr)
         addr = qir_MAX(c, addr, qir_uniform_ui(c, 0));
         addr = qir_MIN(c, addr, qir_uniform_ui(c, size - 4));
 
-        qir_TEX_DIRECT(c, addr, qir_uniform(c, QUNIFORM_TEXTURE_MSAA_ADDR, unit));
+        qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0),
+                     addr, qir_uniform(c, QUNIFORM_TEXTURE_MSAA_ADDR, unit));
 
         ntq_emit_thrsw(c);
 
@@ -479,14 +483,20 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
                                            unit | (is_txl << 16));
         }
 
+        struct qinst *tmu;
         if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
-                qir_TEX_R(c, r, texture_u[next_texture_u++]);
+                tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_R, 0), r);
+                tmu->src[qir_get_tex_uniform_src(tmu)] =
+                        texture_u[next_texture_u++];
         } else if (c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP_TO_BORDER ||
                    c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP ||
                    c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP_TO_BORDER ||
                    c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP) {
-                qir_TEX_R(c, qir_uniform(c, QUNIFORM_TEXTURE_BORDER_COLOR, unit),
-                          texture_u[next_texture_u++]);
+                tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_R, 0),
+                                   qir_uniform(c, QUNIFORM_TEXTURE_BORDER_COLOR,
+                                               unit));
+                tmu->src[qir_get_tex_uniform_src(tmu)] =
+                        texture_u[next_texture_u++];
         }
 
         if (c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP) {
@@ -497,12 +507,18 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
                 t = qir_SAT(c, t);
         }
 
-        qir_TEX_T(c, t, texture_u[next_texture_u++]);
+        tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_T, 0), t);
+        tmu->src[qir_get_tex_uniform_src(tmu)] =
+                texture_u[next_texture_u++];
 
-        if (is_txl || is_txb)
-                qir_TEX_B(c, lod, texture_u[next_texture_u++]);
+        if (is_txl || is_txb) {
+                tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_B, 0), lod);
+                tmu->src[qir_get_tex_uniform_src(tmu)] =
+                        texture_u[next_texture_u++];
+        }
 
-        qir_TEX_S(c, s, texture_u[next_texture_u++]);
+        tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_S, 0), s);
+        tmu->src[qir_get_tex_uniform_src(tmu)] = texture_u[next_texture_u++];
 
         c->num_texture_samples++;
 
diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c
index 2c9119d..7c556a9 100644
--- a/src/gallium/drivers/vc4/vc4_qir.c
+++ b/src/gallium/drivers/vc4/vc4_qir.c
@@ -75,11 +75,6 @@ static const struct qir_op_info qir_op_info[] = {
         [QOP_FRAG_Z] = { "frag_z", 1, 0 },
         [QOP_FRAG_W] = { "frag_w", 1, 0 },
 
-        [QOP_TEX_S] = { "tex_s", 0, 2, true },
-        [QOP_TEX_T] = { "tex_t", 0, 2, true },
-        [QOP_TEX_R] = { "tex_r", 0, 2, true },
-        [QOP_TEX_B] = { "tex_b", 0, 2, true },
-        [QOP_TEX_DIRECT] = { "tex_direct", 0, 2, true },
         [QOP_TEX_RESULT] = { "tex_result", 1, 0, true },
 
         [QOP_THRSW] = { "thrsw", 0, 0, true },
@@ -104,12 +99,37 @@ qir_get_op_name(enum qop qop)
 }
 
 int
-qir_get_nsrc(struct qinst *inst)
+qir_get_non_sideband_nsrc(struct qinst *inst)
 {
         assert(qir_op_info[inst->op].name);
         return qir_op_info[inst->op].nsrc;
 }
 
+int
+qir_get_nsrc(struct qinst *inst)
+{
+        assert(qir_op_info[inst->op].name);
+
+        int nsrc = qir_get_non_sideband_nsrc(inst);
+
+        /* Normal (non-direct) texture coordinate writes also implicitly load
+         * a uniform for the texture parameters.
+         */
+        if (qir_is_tex(inst) && inst->dst.file != QFILE_TEX_S_DIRECT)
+                nsrc++;
+
+        return nsrc;
+}
+
+/* The sideband uniform for textures gets stored after the normal ALU
+ * arguments.
+ */
+int
+qir_get_tex_uniform_src(struct qinst *inst)
+{
+        return qir_get_nsrc(inst) - 1;
+}
+
 /**
  * Returns whether the instruction has any side effects that must be
  * preserved.
@@ -122,6 +142,11 @@ qir_has_side_effects(struct vc4_compile *c, struct qinst *inst)
         case QFILE_TLB_COLOR_WRITE:
         case QFILE_TLB_COLOR_WRITE_MS:
         case QFILE_TLB_STENCIL_SETUP:
+        case QFILE_TEX_S_DIRECT:
+        case QFILE_TEX_S:
+        case QFILE_TEX_T:
+        case QFILE_TEX_R:
+        case QFILE_TEX_B:
                 return true;
         default:
                 break;
@@ -206,7 +231,30 @@ qir_is_raw_mov(struct qinst *inst)
 bool
 qir_is_tex(struct qinst *inst)
 {
-        return inst->op >= QOP_TEX_S && inst->op <= QOP_TEX_DIRECT;
+        switch (inst->dst.file) {
+        case QFILE_TEX_S_DIRECT:
+        case QFILE_TEX_S:
+        case QFILE_TEX_T:
+        case QFILE_TEX_R:
+        case QFILE_TEX_B:
+                return true;
+        default:
+                return false;
+        }
+}
+
+bool
+qir_has_implicit_tex_uniform(struct qinst *inst)
+{
+        switch (inst->dst.file) {
+        case QFILE_TEX_S:
+        case QFILE_TEX_T:
+        case QFILE_TEX_R:
+        case QFILE_TEX_B:
+                return true;
+        default:
+                return false;
+        }
 }
 
 bool
@@ -298,6 +346,11 @@ qir_print_reg(struct vc4_compile *c, struct qreg reg, bool write)
                 [QFILE_FRAG_Y] = "frag_y",
                 [QFILE_FRAG_REV_FLAG] = "frag_rev_flag",
                 [QFILE_QPU_ELEMENT] = "elem",
+                [QFILE_TEX_S_DIRECT] = "tex_s_direct",
+                [QFILE_TEX_S] = "tex_s",
+                [QFILE_TEX_T] = "tex_t",
+                [QFILE_TEX_R] = "tex_r",
+                [QFILE_TEX_B] = "tex_b",
         };
 
         switch (reg.file) {
@@ -330,6 +383,11 @@ qir_print_reg(struct vc4_compile *c, struct qreg reg, bool write)
         case QFILE_TLB_COLOR_WRITE_MS:
         case QFILE_TLB_Z_WRITE:
         case QFILE_TLB_STENCIL_SETUP:
+        case QFILE_TEX_S_DIRECT:
+        case QFILE_TEX_S:
+        case QFILE_TEX_T:
+        case QFILE_TEX_R:
+        case QFILE_TEX_B:
                 fprintf(stderr, "%s", files[reg.file]);
                 break;
 
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index a3b8762..99cc957 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -55,6 +55,18 @@ enum qfile {
         QFILE_TLB_Z_WRITE,
         QFILE_TLB_STENCIL_SETUP,
 
+        /* If tex_s is written on its own without preceding t/r/b setup, it's
+         * a direct memory access using the input value, without the sideband
+         * uniform load. We represent these in QIR as a separate write
+         * destination so we can tell if the sideband uniform is present.
+         */
+        QFILE_TEX_S_DIRECT,
+
+        QFILE_TEX_S,
+        QFILE_TEX_T,
+        QFILE_TEX_R,
+        QFILE_TEX_B,
+
         /* Payload registers that aren't in the physical register file, so we
          * can just use the corresponding qpu_reg at qpu_emit time.
          */
@@ -132,24 +144,6 @@ enum qop {
         QOP_FRAG_Z,
         QOP_FRAG_W,
 
-        /** Texture x coordinate parameter write */
-        QOP_TEX_S,
-        /** Texture y coordinate parameter write */
-        QOP_TEX_T,
-        /** Texture border color parameter or cube map z coordinate write */
-        QOP_TEX_R,
-        /** Texture LOD bias parameter write */
-        QOP_TEX_B,
-
-        /**
-         * Texture-unit 4-byte read with address provided direct in S
-         * cooordinate.
-         *
-         * The first operand is the offset from the start of the UBO, and the
-         * second is the uniform that has the UBO's base pointer.
-         */
-        QOP_TEX_DIRECT,
-
         /**
          * Signal of texture read being necessary and then reading r4 into
          * the destination
@@ -203,7 +197,7 @@ struct qinst {
 
         enum qop op;
         struct qreg dst;
-        struct qreg src[2];
+        struct qreg src[3];
         bool sf;
         bool cond_is_exec_mask;
         uint8_t cond;
@@ -578,12 +572,15 @@ struct qinst *qir_emit_nondef(struct vc4_compile *c, struct qinst *inst);
 struct qreg qir_get_temp(struct vc4_compile *c);
 void qir_calculate_live_intervals(struct vc4_compile *c);
 int qir_get_nsrc(struct qinst *inst);
+int qir_get_non_sideband_nsrc(struct qinst *inst);
+int qir_get_tex_uniform_src(struct qinst *inst);
 bool qir_reg_equals(struct qreg a, struct qreg b);
 bool qir_has_side_effects(struct vc4_compile *c, struct qinst *inst);
 bool qir_has_side_effect_reads(struct vc4_compile *c, struct qinst *inst);
 bool qir_is_mul(struct qinst *inst);
 bool qir_is_raw_mov(struct qinst *inst);
 bool qir_is_tex(struct qinst *inst);
+bool qir_has_implicit_tex_uniform(struct qinst *inst);
 bool qir_is_float_input(struct qinst *inst);
 bool qir_depends_on_flags(struct qinst *inst);
 bool qir_writes_r4(struct qinst *inst);
@@ -737,11 +734,6 @@ QIR_ALU1(RSQ)
 QIR_ALU1(EXP2)
 QIR_ALU1(LOG2)
 QIR_ALU1(VARY_ADD_C)
-QIR_NODST_2(TEX_S)
-QIR_NODST_2(TEX_T)
-QIR_NODST_2(TEX_R)
-QIR_NODST_2(TEX_B)
-QIR_NODST_2(TEX_DIRECT)
 QIR_PAYLOAD(FRAG_Z)
 QIR_PAYLOAD(FRAG_W)
 QIR_ALU0(TEX_RESULT)
diff --git a/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c b/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c
index 1884cfa..9ecfe65 100644
--- a/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c
+++ b/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c
@@ -77,7 +77,7 @@ is_lowerable_uniform(struct qinst *inst, int i)
         if (inst->src[i].file != QFILE_UNIF)
                 return false;
         if (qir_is_tex(inst))
-                return i != 1;
+                return i != qir_get_tex_uniform_src(inst);
         return true;
 }
 
diff --git a/src/gallium/drivers/vc4/vc4_qir_schedule.c b/src/gallium/drivers/vc4/vc4_qir_schedule.c
index c1a2db5..a8ef189 100644
--- a/src/gallium/drivers/vc4/vc4_qir_schedule.c
+++ b/src/gallium/drivers/vc4/vc4_qir_schedule.c
@@ -212,18 +212,6 @@ calculate_deps(struct schedule_setup_state *state, struct schedule_node *n)
                 add_dep(dir, state->last_vary_read, n);
                 break;
 
-        case QOP_TEX_S:
-        case QOP_TEX_T:
-        case QOP_TEX_R:
-        case QOP_TEX_B:
-        case QOP_TEX_DIRECT:
-                /* Texturing setup gets scheduled in order, because
-                 * the uniforms referenced by them have to land in a
-                 * specific order.
-                 */
-                add_write_dep(dir, &state->last_tex_coord, n);
-                break;
-
         case QOP_TEX_RESULT:
                 /* Results have to be fetched in order. */
                 add_write_dep(dir, &state->last_tex_result, n);
@@ -278,6 +266,18 @@ calculate_deps(struct schedule_setup_state *state, struct schedule_node *n)
                 add_write_dep(dir, &state->last_tlb, n);
                 break;
 
+        case QFILE_TEX_S_DIRECT:
+        case QFILE_TEX_S:
+        case QFILE_TEX_T:
+        case QFILE_TEX_R:
+        case QFILE_TEX_B:
+                /* Texturing setup gets scheduled in order, because
+                 * the uniforms referenced by them have to land in a
+                 * specific order.
+                 */
+                add_write_dep(dir, &state->last_tex_coord, n);
+                break;
+
         default:
                 break;
         }
@@ -315,12 +315,12 @@ calculate_forward_deps(struct vc4_compile *c, void *mem_ctx,
                         }
                 }
 
-                switch (inst->op) {
-                case QOP_TEX_S:
-                case QOP_TEX_T:
-                case QOP_TEX_R:
-                case QOP_TEX_B:
-                case QOP_TEX_DIRECT:
+                switch (inst->dst.file) {
+                case QFILE_TEX_S_DIRECT:
+                case QFILE_TEX_S:
+                case QFILE_TEX_T:
+                case QFILE_TEX_R:
+                case QFILE_TEX_B:
                         /* From the VC4 spec:
                          *
                          * "The TFREQ input FIFO holds two full lots of s,
@@ -364,8 +364,8 @@ calculate_forward_deps(struct vc4_compile *c, void *mem_ctx,
                          * If the texture result fifo is full, block adding
                          * any more to it until the last QOP_TEX_RESULT.
                          */
-                        if (inst->op == QOP_TEX_S ||
-                            inst->op == QOP_TEX_DIRECT) {
+                        if (inst->dst.file == QFILE_TEX_S ||
+                            inst->dst.file == QFILE_TEX_S_DIRECT) {
                                 if (state.tfrcv_count ==
                                     (c->fs_threaded ? 2 : 4))
                                         block_until_tex_result(&state, n);
@@ -376,6 +376,11 @@ calculate_forward_deps(struct vc4_compile *c, void *mem_ctx,
                         state.tfreq_count++;
                         break;
 
+                default:
+                        break;
+                }
+
+                switch (inst->op) {
                 case QOP_TEX_RESULT:
                         /* Results have to be fetched after the
                          * coordinate setup. Note that we're assuming
@@ -398,7 +403,6 @@ calculate_forward_deps(struct vc4_compile *c, void *mem_ctx,
                         break;
 
                 default:
-                        assert(!qir_is_tex(inst));
                         break;
                 }
         }
@@ -560,8 +564,8 @@ dump_state(struct vc4_compile *c, struct schedule_state *state)
 static uint32_t
 latency_between(struct schedule_node *before, struct schedule_node *after)
 {
-        if ((before->inst->op == QOP_TEX_S ||
-             before->inst->op == QOP_TEX_DIRECT) &&
+        if ((before->inst->dst.file == QFILE_TEX_S ||
+             before->inst->dst.file == QFILE_TEX_S_DIRECT) &&
             after->inst->op == QOP_TEX_RESULT)
                 return 100;
 
diff --git a/src/gallium/drivers/vc4/vc4_qir_validate.c b/src/gallium/drivers/vc4/vc4_qir_validate.c
index 9579f7a..302eb48 100644
--- a/src/gallium/drivers/vc4/vc4_qir_validate.c
+++ b/src/gallium/drivers/vc4/vc4_qir_validate.c
@@ -84,6 +84,25 @@ void qir_validate(struct vc4_compile *c)
                 case QFILE_LOAD_IMM:
                         fail_instr(c, inst, "Bad dest file");
                         break;
+
+                case QFILE_TEX_S:
+                case QFILE_TEX_T:
+                case QFILE_TEX_R:
+                case QFILE_TEX_B:
+                        if (inst->src[qir_get_tex_uniform_src(inst)].file !=
+                            QFILE_UNIF) {
+                                fail_instr(c, inst,
+                                           "tex op missing implicit uniform");
+                        }
+                        break;
+
+                case QFILE_TEX_S_DIRECT:
+                        if (inst->op != QOP_ADD) {
+                                fail_instr(c, inst,
+                                           "kernel validation requires that "
+                                           "direct texture lookups use an ADD");
+                        }
+                        break;
                 }
 
                 for (int i = 0; i < qir_get_nsrc(inst); i++) {
@@ -119,6 +138,11 @@ void qir_validate(struct vc4_compile *c)
                         case QFILE_TLB_COLOR_WRITE_MS:
                         case QFILE_TLB_Z_WRITE:
                         case QFILE_TLB_STENCIL_SETUP:
+                        case QFILE_TEX_S_DIRECT:
+                        case QFILE_TEX_S:
+                        case QFILE_TEX_T:
+                        case QFILE_TEX_R:
+                        case QFILE_TEX_B:
                                 fail_instr(c, inst, "Bad src file");
                                 break;
                         }
diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c
index 9d9e5d8..47fc0b0 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_emit.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c
@@ -347,6 +347,11 @@ vc4_generate_code_block(struct vc4_compile *c,
                         case QFILE_TLB_COLOR_WRITE_MS:
                         case QFILE_TLB_Z_WRITE:
                         case QFILE_TLB_STENCIL_SETUP:
+                        case QFILE_TEX_S:
+                        case QFILE_TEX_S_DIRECT:
+                        case QFILE_TEX_T:
+                        case QFILE_TEX_R:
+                        case QFILE_TEX_B:
                                 unreachable("bad qir src file");
                         }
                 }
@@ -379,6 +384,23 @@ vc4_generate_code_block(struct vc4_compile *c,
                         dst = qpu_ra(QPU_W_TLB_STENCIL_SETUP);
                         break;
 
+                case QFILE_TEX_S:
+                case QFILE_TEX_S_DIRECT:
+                        dst = qpu_rb(QPU_W_TMU0_S);
+                        break;
+
+                case QFILE_TEX_T:
+                        dst = qpu_rb(QPU_W_TMU0_T);
+                        break;
+
+                case QFILE_TEX_R:
+                        dst = qpu_rb(QPU_W_TMU0_R);
+                        break;
+
+                case QFILE_TEX_B:
+                        dst = qpu_rb(QPU_W_TMU0_B);
+                        break;
+
                 case QFILE_VARY:
                 case QFILE_UNIF:
                 case QFILE_SMALL_IMM:
@@ -477,21 +499,6 @@ vc4_generate_code_block(struct vc4_compile *c,
                         queue(block, qpu_a_FADD(dst, src[0], qpu_r5()) | unpack);
                         break;
 
-                case QOP_TEX_S:
-                case QOP_TEX_T:
-                case QOP_TEX_R:
-                case QOP_TEX_B:
-                        queue(block, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
-                                                      (qinst->op - QOP_TEX_S)),
-                                               src[0]) | unpack);
-                        break;
-
-                case QOP_TEX_DIRECT:
-                        fixup_raddr_conflict(block, dst, &src[0], &src[1],
-                                             qinst, &unpack);
-                        queue(block, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S),
-                                               src[0], src[1]) | unpack);
-                        break;
 
                 case QOP_TEX_RESULT:
                         queue(block, qpu_NOP());
@@ -538,7 +545,7 @@ vc4_generate_code_block(struct vc4_compile *c,
                          * argument slot as well so that we don't take up
                          * another raddr just to get unused data.
                          */
-                        if (qir_get_nsrc(qinst) == 1)
+                        if (qir_get_non_sideband_nsrc(qinst) == 1)
                                 src[1] = src[0];
 
                         fixup_raddr_conflict(block, dst, &src[0], &src[1],
-- 
2.7.4