From 2ce449fa7d7829b24dd0292d866aaf925937a048 Mon Sep 17 00:00:00 2001
From: Rob Clark
Date: Mon, 17 Apr 2017 11:40:16 -0400
Subject: [PATCH] freedreno/ir3: compute shader support

Signed-off-by: Rob Clark
---
 src/gallium/drivers/freedreno/disasm.h           |  1 +
 .../drivers/freedreno/ir3/ir3_compiler_nir.c     | 81 +++++++++++++++++-----
 src/gallium/drivers/freedreno/ir3/ir3_shader.c   | 76 +++++++++++++++++++-
 src/gallium/drivers/freedreno/ir3/ir3_shader.h   | 15 +++-
 4 files changed, 154 insertions(+), 19 deletions(-)

diff --git a/src/gallium/drivers/freedreno/disasm.h b/src/gallium/drivers/freedreno/disasm.h
index bac1215..579dd50 100644
--- a/src/gallium/drivers/freedreno/disasm.h
+++ b/src/gallium/drivers/freedreno/disasm.h
@@ -31,6 +31,7 @@ enum shader_t {
 	SHADER_GEOM,
 	SHADER_FRAGMENT,
 	SHADER_COMPUTE,
+	SHADER_MAX,
 };
 
 /* bitmask of debug flags */
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index a164675..46d0525 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -71,6 +71,9 @@ struct ir3_compile {
 	/* For vertex shaders, keep track of the system values sources */
 	struct ir3_instruction *vertex_id, *basevertex, *instance_id;
 
+	/* Compute shader inputs: */
+	struct ir3_instruction *local_invocation_id, *work_group_id;
+
 	/* For SSBO's and atomics, we need to preserve order, such
 	 * that reads don't overtake writes, and the order of writes
 	 * is preserved.  Atomics are considered as a write.
@@ -228,15 +231,21 @@ compile_init(struct ir3_compiler *compiler,
 		constoff += align(ctx->s->info->num_ubos * ptrsz, 4) / 4;
 	}
 
+	unsigned num_driver_params = 0;
 	if (so->type == SHADER_VERTEX) {
-		so->constbase.driver_param = constoff;
-		constoff += align(IR3_DP_COUNT, 4) / 4;
+		num_driver_params = IR3_DP_VS_COUNT;
+	} else if (so->type == SHADER_COMPUTE) {
+		num_driver_params = IR3_DP_CS_COUNT;
+	}
 
-		if ((compiler->gpu_id < 500) &&
-				so->shader->stream_output.num_outputs > 0) {
-			so->constbase.tfbo = constoff;
-			constoff += align(PIPE_MAX_SO_BUFFERS * ptrsz, 4) / 4;
-		}
+	so->constbase.driver_param = constoff;
+	constoff += align(num_driver_params, 4) / 4;
+
+	if ((so->type == SHADER_VERTEX) &&
+			(compiler->gpu_id < 500) &&
+			so->shader->stream_output.num_outputs > 0) {
+		so->constbase.tfbo = constoff;
+		constoff += align(PIPE_MAX_SO_BUFFERS * ptrsz, 4) / 4;
 	}
 
 	so->constbase.immediate = constoff;
@@ -538,7 +547,7 @@ create_var_store(struct ir3_compile *ctx, struct ir3_array *arr, int n,
 }
 
 static struct ir3_instruction *
-create_input(struct ir3_block *block, unsigned n)
+create_input_compmask(struct ir3_block *block, unsigned n, unsigned compmask)
 {
 	struct ir3_instruction *in;
 
@@ -546,10 +555,18 @@
 	in->inout.block = block;
 	ir3_reg_create(in, n, 0);
 
+	in->regs[0]->wrmask = compmask;
+
 	return in;
 }
 
 static struct ir3_instruction *
+create_input(struct ir3_block *block, unsigned n)
+{
+	return create_input_compmask(block, n, 0x1);
+}
+
+static struct ir3_instruction *
 create_frag_input(struct ir3_compile *ctx, bool use_ldlv)
 {
 	struct ir3_block *block = ctx->block;
@@ -1309,7 +1326,8 @@ emit_intrinsic_atomic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 	array_insert(b, b->keeps, atomic);
 }
 
-static void add_sysval_input(struct ir3_compile *ctx, gl_system_value slot,
+static void add_sysval_input_compmask(struct ir3_compile *ctx,
+		gl_system_value slot, unsigned compmask,
 		struct ir3_instruction *instr)
 {
 	struct ir3_shader_variant *so = ctx->so;
@@ -1318,7 +1336,7 @@ static void add_sysval_input(struct ir3_compile *ctx, gl_system_value slot,
 
 	so->inputs[n].sysval = true;
 	so->inputs[n].slot = slot;
-	so->inputs[n].compmask = 1;
+	so->inputs[n].compmask = compmask;
 	so->inputs[n].regid = r;
 	so->inputs[n].interpolate = INTERP_MODE_FLAT;
 	so->total_in++;
@@ -1327,6 +1345,12 @@
 	ctx->ir->inputs[r] = instr;
 }
 
+static void add_sysval_input(struct ir3_compile *ctx, gl_system_value slot,
+		struct ir3_instruction *instr)
+{
+	add_sysval_input_compmask(ctx, slot, 0x1, instr);
+}
+
 static void
 emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 {
@@ -1476,6 +1500,28 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 		dst[0] = ir3_COV(b, ctx->frag_face, TYPE_S16, TYPE_S32);
 		dst[0] = ir3_ADD_S(b, dst[0], 0, create_immed(b, 1), 0);
 		break;
+	case nir_intrinsic_load_local_invocation_id:
+		if (!ctx->local_invocation_id) {
+			ctx->local_invocation_id = create_input_compmask(b, 0, 0x7);
+			add_sysval_input_compmask(ctx, SYSTEM_VALUE_LOCAL_INVOCATION_ID,
+					0x7, ctx->local_invocation_id);
+		}
+		split_dest(b, dst, ctx->local_invocation_id, 0, 3);
+		break;
+	case nir_intrinsic_load_work_group_id:
+		if (!ctx->work_group_id) {
+			ctx->work_group_id = create_input_compmask(b, 0, 0x7);
+			add_sysval_input_compmask(ctx, SYSTEM_VALUE_WORK_GROUP_ID,
+					0x7, ctx->work_group_id);
+			ctx->work_group_id->regs[0]->flags |= IR3_REG_HIGH;
+		}
+		split_dest(b, dst, ctx->work_group_id, 0, 3);
+		break;
+	case nir_intrinsic_load_num_work_groups:
+		for (int i = 0; i < intr->num_components; i++) {
+			dst[i] = create_driver_param(ctx, IR3_DP_NUM_WORK_GROUPS_X + i);
+		}
+		break;
 	case nir_intrinsic_discard_if:
 	case nir_intrinsic_discard: {
 		struct ir3_instruction *cond, *kill;
@@ -2381,6 +2427,11 @@ max_drvloc(struct exec_list *vars)
 	return drvloc;
 }
 
+static const unsigned max_sysvals[SHADER_MAX] = {
+	[SHADER_VERTEX]  = 16,
+	[SHADER_COMPUTE] = 16, // TODO how many do we actually need?
+};
+
 static void
 emit_instructions(struct ir3_compile *ctx)
 {
@@ -2390,11 +2441,9 @@ emit_instructions(struct ir3_compile *ctx)
 	ninputs  = (max_drvloc(&ctx->s->inputs) + 1) * 4;
 	noutputs = (max_drvloc(&ctx->s->outputs) + 1) * 4;
 
-	/* or vtx shaders, we need to leave room for sysvals:
+	/* we need to leave room for sysvals:
 	 */
-	if (ctx->so->type == SHADER_VERTEX) {
-		ninputs += 16;
-	}
+	ninputs += max_sysvals[ctx->so->type];
 
 	ctx->ir = ir3_create(ctx->compiler, ninputs, noutputs);
 
@@ -2403,9 +2452,7 @@ emit_instructions(struct ir3_compile *ctx)
 	ctx->in_block = ctx->block;
 	list_addtail(&ctx->block->node, &ctx->ir->block_list);
 
-	if (ctx->so->type == SHADER_VERTEX) {
-		ctx->ir->ninputs -= 16;
-	}
+	ninputs -= max_sysvals[ctx->so->type];
 
 	/* for fragment shader, we have a single input register (usually
 	 * r0.xy) which is used as the base for bary.f varying fetch instrs:
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
index 9133317..4ca96ce 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
@@ -291,6 +291,7 @@ ir3_shader_create(struct ir3_compiler *compiler,
 		/* we take ownership of the reference: */
 		nir = cso->ir.nir;
 	} else {
+		debug_assert(cso->type == PIPE_SHADER_IR_TGSI);
 		if (fd_mesa_debug & FD_DBG_DISASM) {
 			DBG("dump tgsi: type=%d", shader->type);
 			tgsi_dump(cso->tokens, 0);
@@ -317,6 +318,43 @@ ir3_shader_create(struct ir3_compiler *compiler,
 	return shader;
 }
 
+/* a bit annoying that compute-shader and normal shader state objects
+ * aren't a bit more aligned.
+ */
+struct ir3_shader *
+ir3_shader_create_compute(struct ir3_compiler *compiler,
+		const struct pipe_compute_state *cso,
+		struct pipe_debug_callback *debug)
+{
+	struct ir3_shader *shader = CALLOC_STRUCT(ir3_shader);
+
+	shader->compiler = compiler;
+	shader->id = ++shader->compiler->shader_count;
+	shader->type = SHADER_COMPUTE;
+
+	nir_shader *nir;
+	if (cso->ir_type == PIPE_SHADER_IR_NIR) {
+		/* we take ownership of the reference: */
+		nir = (nir_shader *)cso->prog;
+	} else {
+		debug_assert(cso->ir_type == PIPE_SHADER_IR_TGSI);
+		if (fd_mesa_debug & FD_DBG_DISASM) {
+			DBG("dump tgsi: type=%d", shader->type);
+			tgsi_dump(cso->prog, 0);
+		}
+		nir = ir3_tgsi_to_nir(cso->prog);
+	}
+
+	/* do first pass optimization, ignoring the key: */
+	shader->nir = ir3_optimize_nir(shader, nir, NULL);
+	if (fd_mesa_debug & FD_DBG_DISASM) {
+		DBG("dump nir%d: type=%d", shader->id, shader->type);
+		nir_print_shader(shader->nir, stdout);
+	}
+
+	return shader;
+}
+
 static void dump_reg(const char *name, uint32_t r)
 {
 	if (r != regid(63,0))
@@ -684,7 +722,7 @@ ir3_emit_vs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *rin
 	if (info) {
 		uint32_t offset = v->constbase.driver_param;
 		if (v->constlen > offset) {
-			uint32_t vertex_params[IR3_DP_COUNT] = {
+			uint32_t vertex_params[IR3_DP_VS_COUNT] = {
 				[IR3_DP_VTXID_BASE] = info->indexed ?
 					info->index_bias : info->start,
 				[IR3_DP_VTXCNT_MAX] = max_tf_vtx(ctx, v),
@@ -739,3 +777,39 @@ ir3_emit_fs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *rin
 		emit_immediates(ctx, v, ring);
 	}
 }
+
+/* emit compute-shader consts: */
+void
+ir3_emit_cs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
+		struct fd_context *ctx, const struct pipe_grid_info *info)
+{
+	enum fd_dirty_shader_state dirty = ctx->dirty_shader[PIPE_SHADER_COMPUTE];
+
+	if (dirty & (FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_CONST)) {
+		struct fd_constbuf_stateobj *constbuf;
+		bool shader_dirty;
+
+		constbuf = &ctx->constbuf[PIPE_SHADER_COMPUTE];
+		shader_dirty = !!(dirty & FD_DIRTY_SHADER_PROG);
+
+		emit_user_consts(ctx, v, ring, constbuf);
+		emit_ubos(ctx, v, ring, constbuf);
+		if (shader_dirty)
+			emit_immediates(ctx, v, ring);
+	}
+
+	/* emit compute-shader driver-params: */
+	uint32_t offset = v->constbase.driver_param;
+	if (v->constlen > offset) {
+		uint32_t compute_params[IR3_DP_CS_COUNT] = {
+			[IR3_DP_NUM_WORK_GROUPS_X] = info->grid[0],
+			[IR3_DP_NUM_WORK_GROUPS_Y] = info->grid[1],
+			[IR3_DP_NUM_WORK_GROUPS_Z] = info->grid[2],
+			/* do we need work-group-size? */
+		};
+
+		fd_wfi(ctx->batch, ring);
+		ctx->emit_const(ring, SHADER_COMPUTE, offset * 4, 0,
+				ARRAY_SIZE(compute_params), compute_params, NULL);
+	}
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
index a06dd04..6c2af6d 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
@@ -38,13 +38,20 @@
 
 /* driver param indices: */
 enum ir3_driver_param {
+	/* compute shader driver params: */
+	IR3_DP_NUM_WORK_GROUPS_X = 0,
+	IR3_DP_NUM_WORK_GROUPS_Y = 1,
+	IR3_DP_NUM_WORK_GROUPS_Z = 2,
+	IR3_DP_CS_COUNT   = 4,   /* must be aligned to vec4 */
+
+	/* vertex shader driver params: */
 	IR3_DP_VTXID_BASE = 0,
 	IR3_DP_VTXCNT_MAX = 1,
 	/* user-clip-plane components, up to 8x vec4's: */
 	IR3_DP_UCP0_X     = 4,
 	/* .... */
 	IR3_DP_UCP7_W     = 35,
-	IR3_DP_COUNT      = 36   /* must be aligned to vec4 */
+	IR3_DP_VS_COUNT   = 36   /* must be aligned to vec4 */
 };
 
 /* Configuration key used to identify a shader variant.. different
@@ -313,6 +320,10 @@ void * ir3_shader_assemble(struct ir3_shader_variant *v, uint32_t gpu_id);
 struct ir3_shader * ir3_shader_create(struct ir3_compiler *compiler,
 		const struct pipe_shader_state *cso, enum shader_t type,
 		struct pipe_debug_callback *debug);
+struct ir3_shader *
+ir3_shader_create_compute(struct ir3_compiler *compiler,
+		const struct pipe_compute_state *cso,
+		struct pipe_debug_callback *debug);
 void ir3_shader_destroy(struct ir3_shader *shader);
 struct ir3_shader_variant * ir3_shader_variant(struct ir3_shader *shader,
 		struct ir3_shader_key key, struct pipe_debug_callback *debug);
@@ -325,6 +336,8 @@ void ir3_emit_vs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer
 		struct fd_context *ctx, const struct pipe_draw_info *info);
 void ir3_emit_fs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
 		struct fd_context *ctx);
+void ir3_emit_cs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
+		struct fd_context *ctx, const struct pipe_grid_info *info);
 
 static inline const char *
 ir3_shader_stage(struct ir3_shader *shader)
-- 
2.7.4
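
Usage sketch (not part of the patch): the entry points added above are only the
compiler and const-emit half; a generation-specific backend still has to wire
them into its compute-state and grid-launch hooks. The fragment below is a
minimal illustration of such a caller, assuming the usual fd_context()/
fd_screen() helpers and the context's pipe_debug_callback. The fdN_* names and
the ring/shader plumbing are hypothetical; only ir3_shader_create_compute(),
ir3_shader_variant() and ir3_emit_cs_consts() come from this patch.

	/* Hypothetical backend glue -- fdN_* names and plumbing are illustrative only. */

	#include "pipe/p_state.h"
	#include "freedreno_context.h"
	#include "freedreno_screen.h"
	#include "ir3/ir3_shader.h"

	/* pipe_context::create_compute_state() hook: compile TGSI/NIR once,
	 * ignoring the variant key (compute has no key bits yet):
	 */
	static void *
	fdN_compute_state_create(struct pipe_context *pctx,
			const struct pipe_compute_state *cso)
	{
		struct fd_context *ctx = fd_context(pctx);
		return ir3_shader_create_compute(fd_screen(pctx->screen)->compiler,
				cso, &ctx->debug);
	}

	/* at grid-launch time: pick the (only) variant and upload user consts,
	 * UBO pointers, immediates and the IR3_DP_NUM_WORK_GROUPS_* driver
	 * params taken from info->grid[], before emitting the dispatch packets:
	 */
	static void
	fdN_emit_cs_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
			struct ir3_shader *shader, const struct pipe_grid_info *info)
	{
		struct ir3_shader_key key = {};
		struct ir3_shader_variant *v =
			ir3_shader_variant(shader, key, &ctx->debug);

		if (!v)
			return;

		ir3_emit_cs_consts(v, ring, ctx, info);

		/* ... program the compute shader itself and kick the grid ... */
	}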