From f8a8de8b9a69fc6f4a8fc86a71f81c168cdd18b0 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Thu, 3 Jan 2019 12:13:18 -0800
Subject: [PATCH] v3d: Do UBO loads a vector at a time.

In the process of adding support for SSBOs and CS shared vars, I ended
up needing a helper function for doing TMU general ops.  This helper can
be the starting point for those, and it already saves us a bunch of
round-trips to the TMU by loading a whole vector at a time.
---
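
Notes (review aid, not part of the commit): the config-word logic in
ntq_emit_tmu_general() is easy to sanity-check outside the driver.  The
standalone sketch below mirrors it -- the #define values are copied from
the patch, while tmu_read_config() and the main() harness are invented
for illustration.  It shows why a one-component 32-bit read produces an
all-ones config, so the address can be written to plain TMUA with no
implicit uniform, while vec2..vec4 loads go through TMUAU with an
explicit config word.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define GENERAL_TMU_LOOKUP_PER_PIXEL     (1 << 7)
#define GENERAL_TMU_READ_OP_READ         (15 << 3)
#define GENERAL_TMU_LOOKUP_TYPE_VEC2     (2 << 0)
#define GENERAL_TMU_LOOKUP_TYPE_32BIT_UI (7 << 0)

/* Mirrors the config computation in ntq_emit_tmu_general(): all-1s
 * defaults, with the op, per-pixel mode, and lookup type OR'd into
 * the low byte.
 */
static uint32_t
tmu_read_config(int num_components)
{
        uint32_t config = (0xffffff00 |
                           GENERAL_TMU_READ_OP_READ |
                           GENERAL_TMU_LOOKUP_PER_PIXEL);

        if (num_components == 1)
                config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
        else
                config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 + num_components - 2;

        return config;
}

int
main(void)
{
        /* 0x80 | 0x78 | 0x7 == 0xff, so a vec1 32-bit read is the
         * all-defaults case: TMUA, no config uniform needed.
         */
        assert(tmu_read_config(1) == ~0u);

        /* vec2..vec4 differ from the defaults, so they take the TMUAU
         * path with the config attached as an implicit uniform.
         */
        for (int n = 2; n <= 4; n++)
                printf("vec%d config = 0x%08x\n", n,
                       (unsigned)tmu_read_config(n));

        return 0;
}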
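
On the round-trip claim in the commit message, here is a toy
before/after harness (again not driver code: emit(), load_ubo_old(),
and load_ubo_new() are invented, and the mnemonics are informal
shorthand rather than real QPU assembly).  A vec4 UBO load used to
cost four TMU lookups and four thread switches; it now costs one
lookup, one thread switch, and four LDTMU result reads.

#include <stdio.h>

static void emit(const char *s) { puts(s); }

static void
load_ubo_old(int num_components)
{
        for (int i = 0; i < num_components; i++) {
                emit("ADD tmua, ubo_addr, offset+i*4"); /* one lookup each */
                emit("thrsw");                          /* one switch each */
                emit("LDTMU");
        }
}

static void
load_ubo_new(int num_components)
{
        /* One address write covers the whole vector; the config word
         * (when not all-defaults) rides along as an implicit uniform.
         */
        emit(num_components == 1 ? "ADD tmua, ubo_addr, offset"
                                 : "ADD tmuau, ubo_addr, offset");
        emit("thrsw");
        for (int i = 0; i < num_components; i++)
                emit("LDTMU");
}

int
main(void)
{
        printf("old vec4 lowering:\n");
        load_ubo_old(4);
        printf("\nnew vec4 lowering:\n");
        load_ubo_new(4);
        return 0;
}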
 src/broadcom/compiler/nir_to_vir.c | 109 ++++++++++++++++++++++++++---------
 src/broadcom/compiler/vir.c        |  10 ++++++++++
 2 files changed, 99 insertions(+), 20 deletions(-)

diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index f209918..91d4ab0 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -32,6 +32,40 @@
 #include "common/v3d_device_info.h"
 #include "v3d_compiler.h"
 
+#define GENERAL_TMU_LOOKUP_PER_QUAD                 (0 << 7)
+#define GENERAL_TMU_LOOKUP_PER_PIXEL                (1 << 7)
+#define GENERAL_TMU_READ_OP_PREFETCH                (0 << 3)
+#define GENERAL_TMU_READ_OP_CACHE_CLEAR             (1 << 3)
+#define GENERAL_TMU_READ_OP_CACHE_FLUSH             (2 << 3)
+#define GENERAL_TMU_READ_OP_CACHE_CLEAN             (3 << 3)
+#define GENERAL_TMU_READ_OP_CACHE_L1T_CLEAR         (4 << 3)
+#define GENERAL_TMU_READ_OP_CACHE_L1T_FLUSH_AGGREGATION (5 << 3)
+#define GENERAL_TMU_READ_OP_ATOMIC_INC              (8 << 3)
+#define GENERAL_TMU_READ_OP_ATOMIC_DEC              (9 << 3)
+#define GENERAL_TMU_READ_OP_ATOMIC_NOT              (10 << 3)
+#define GENERAL_TMU_READ_OP_READ                    (15 << 3)
+#define GENERAL_TMU_LOOKUP_TYPE_8BIT_I              (0 << 0)
+#define GENERAL_TMU_LOOKUP_TYPE_16BIT_I             (1 << 0)
+#define GENERAL_TMU_LOOKUP_TYPE_VEC2                (2 << 0)
+#define GENERAL_TMU_LOOKUP_TYPE_VEC3                (3 << 0)
+#define GENERAL_TMU_LOOKUP_TYPE_VEC4                (4 << 0)
+#define GENERAL_TMU_LOOKUP_TYPE_8BIT_UI             (5 << 0)
+#define GENERAL_TMU_LOOKUP_TYPE_16BIT_UI            (6 << 0)
+#define GENERAL_TMU_LOOKUP_TYPE_32BIT_UI            (7 << 0)
+
+#define GENERAL_TMU_WRITE_OP_ATOMIC_ADD_WRAP        (0 << 3)
+#define GENERAL_TMU_WRITE_OP_ATOMIC_SUB_WRAP        (1 << 3)
+#define GENERAL_TMU_WRITE_OP_ATOMIC_XCHG            (2 << 3)
+#define GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG         (3 << 3)
+#define GENERAL_TMU_WRITE_OP_ATOMIC_UMIN            (4 << 3)
+#define GENERAL_TMU_WRITE_OP_ATOMIC_UMAX            (5 << 3)
+#define GENERAL_TMU_WRITE_OP_ATOMIC_SMIN            (6 << 3)
+#define GENERAL_TMU_WRITE_OP_ATOMIC_SMAX            (7 << 3)
+#define GENERAL_TMU_WRITE_OP_ATOMIC_AND             (8 << 3)
+#define GENERAL_TMU_WRITE_OP_ATOMIC_OR              (9 << 3)
+#define GENERAL_TMU_WRITE_OP_ATOMIC_XOR             (10 << 3)
+#define GENERAL_TMU_WRITE_OP_WRITE                  (15 << 3)
+
 static void
 ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);
 
@@ -73,6 +107,60 @@ vir_emit_thrsw(struct v3d_compile *c)
         c->last_thrsw_at_top_level = (c->execute.file == QFILE_NULL);
 }
 
+/**
+ * Implements indirect uniform loads through the TMU general memory access
+ * interface.
+ */
+static void
+ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr)
+{
+        uint32_t tmu_op = GENERAL_TMU_READ_OP_READ;
+        bool has_index = instr->intrinsic == nir_intrinsic_load_ubo;
+        int offset_src = 0 + has_index;
+
+        /* Note that QUNIFORM_UBO_ADDR takes a UBO index shifted up by
+         * 1 (0 is gallium's constant buffer 0).
+         */
+        struct qreg offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
+                                         nir_src_as_uint(instr->src[0]) + 1);
+
+        uint32_t config = (0xffffff00 |
+                           tmu_op |
+                           GENERAL_TMU_LOOKUP_PER_PIXEL);
+        if (instr->num_components == 1) {
+                config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
+        } else {
+                config |= (GENERAL_TMU_LOOKUP_TYPE_VEC2 +
+                           instr->num_components - 2);
+        }
+
+        struct qreg dest;
+        if (config == ~0)
+                dest = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);
+        else
+                dest = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
+
+        struct qinst *tmu;
+        if (nir_src_is_const(instr->src[offset_src]) &&
+            nir_src_as_uint(instr->src[offset_src]) == 0) {
+                tmu = vir_MOV_dest(c, dest, offset);
+        } else {
+                tmu = vir_ADD_dest(c, dest,
+                                   offset,
+                                   ntq_get_src(c, instr->src[offset_src], 0));
+        }
+
+        if (config != ~0) {
+                tmu->src[vir_get_implicit_uniform_src(tmu)] =
+                        vir_uniform_ui(c, config);
+        }
+
+        vir_emit_thrsw(c);
+
+        for (int i = 0; i < nir_intrinsic_dest_components(instr); i++)
+                ntq_store_dest(c, &instr->dest, i, vir_MOV(c, vir_LDTMU(c)));
+}
+
 static struct qreg
 indirect_uniform_load(struct v3d_compile *c, nir_intrinsic_instr *intr)
 {
@@ -1547,26 +1635,7 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                 break;
 
         case nir_intrinsic_load_ubo:
-                for (int i = 0; i < instr->num_components; i++) {
-                        int ubo = nir_src_as_uint(instr->src[0]);
-
-                        /* XXX perf: On V3D 4.x with uniform offsets, we
-                         * should probably try setting UBOs up in the A
-                         * register file and doing a sequence of loads that
-                         * way.
-                         */
-                        /* Adjust for where we stored the TGSI register base. */
-                        vir_ADD_dest(c,
-                                     vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA),
-                                     vir_uniform(c, QUNIFORM_UBO_ADDR, 1 + ubo),
-                                     vir_ADD(c,
-                                             ntq_get_src(c, instr->src[1], 0),
-                                             vir_uniform_ui(c, i * 4)));
-
-                        vir_emit_thrsw(c);
-
-                        ntq_store_dest(c, &instr->dest, i, vir_LDTMU(c));
-                }
+                ntq_emit_tmu_general(c, instr);
                 break;
 
         case nir_intrinsic_load_user_clip_plane:
diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
index 2f32359..6eb346c 100644
--- a/src/broadcom/compiler/vir.c
+++ b/src/broadcom/compiler/vir.c
@@ -61,6 +61,16 @@ vir_has_implicit_uniform(struct qinst *inst)
         switch (inst->dst.file) {
         case QFILE_TLBU:
                 return true;
+        case QFILE_MAGIC:
+                switch (inst->dst.index) {
+                case V3D_QPU_WADDR_TLBU:
+                case V3D_QPU_WADDR_TMUAU:
+                case V3D_QPU_WADDR_SYNCU:
+                        return true;
+                default:
+                        break;
+                }
+                break;
         default:
                 return inst->has_implicit_uniform;
         }
-- 
2.7.4