Used for function arguments to compute kernels (i.e. OpenCL).
Signed-off-by: Rob Clark <robdclark@chromium.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13300>
}
}
+/* Load a kernel param: src[] = { address }.
+ *
+ * The byte position of the param is nir_intrinsic_base() plus src[0].  It is
+ * converted to a 32b const register index relative to the start of the
+ * kernel-params region (const_state->offsets.kernel_params).  Only dst[0] is
+ * written; see the TODO below about non-32b inputs.
+ */
+static void
+emit_intrinsic_load_kernel_input(struct ir3_context *ctx,
+                                 nir_intrinsic_instr *intr,
+                                 struct ir3_instruction **dst)
+{
+   const struct ir3_const_state *const_state = ir3_const_state(ctx->so);
+   struct ir3_block *b = ctx->block;
+   unsigned offset = nir_intrinsic_base(intr);
+   /* first const register of the kernel-params region: */
+   unsigned p = regid(const_state->offsets.kernel_params, 0);
+
+   struct ir3_instruction *src0 = ir3_get_src(ctx, &intr->src[0])[0];
+
+   if (is_same_type_mov(src0) && (src0->srcs[0]->flags & IR3_REG_IMMED)) {
+      /* Address is an immediate, so fold it into the static offset and
+       * read the const register directly:
+       */
+      offset += src0->srcs[0]->iim_val;
+
+      /* kernel param position is in bytes, but constant space is 32b registers: */
+      compile_assert(ctx, !(offset & 0x3));
+
+      dst[0] = create_uniform(b, p + (offset / 4));
+   } else {
+      /* kernel param position is in bytes, but constant space is 32b registers: */
+      compile_assert(ctx, !(offset & 0x3));
+
+      /* TODO we should probably be lowering this in nir, and also handling
+       * non-32b inputs.. Also we probably don't want to be using
+       * SP_MODE_CONTROL.CONSTANT_DEMOTION_ENABLE for KERNEL shaders..
+       */
+      /* dynamic byte address -> dword index, used as the a0 offset: */
+      src0 = ir3_SHR_B(b, src0, 0, create_immed(b, 2), 0);
+
+      /* The relative access has to be based at the start of the
+       * kernel-params region, same as the direct path above, otherwise
+       * it reads the wrong consts whenever kernel_params is non-zero:
+       */
+      dst[0] = create_uniform_indirect(b, p + (offset / 4), TYPE_U32,
+                                       ir3_get_addr0(ctx, src0, 1));
+   }
+}
+
/* src[] = { block_index } */
static void
emit_intrinsic_ssbo_size(struct ir3_context *ctx, nir_intrinsic_instr *intr,
case nir_intrinsic_load_input:
setup_input(ctx, intr);
break;
+ case nir_intrinsic_load_kernel_input:
+ emit_intrinsic_load_kernel_input(ctx, intr, dst);
+ break;
/* All SSBO intrinsics should have been lowered by 'lower_io_offsets'
* pass and replaced by an ir3-specifc version that adds the
* dword-offset in the last source.
constoff += align(cnt, 4) / 4;
}
+ if (v->type == MESA_SHADER_KERNEL) {
+ const_state->offsets.kernel_params = constoff;
+ constoff += align(v->shader->cs.req_input_mem, 4) / 4;
+ }
+
if (const_state->num_driver_params > 0) {
/* num_driver_params in dwords. we only need to align to vec4s for the
* common case of immediate constant uploads, but for indirect dispatch
* user consts
* UBO addresses
* SSBO sizes
+ * image dimensions
* if (vertex shader) {
- * driver params (IR3_DP_*)
+ * driver params (IR3_DP_VS_COUNT)
* if (stream_output.num_outputs > 0)
* stream-out addresses
* } else if (compute_shader) {
- * driver params (IR3_DP_*)
+ * kernel params
+ * driver params (IR3_DP_CS_COUNT)
* }
* immediates
*
/* user const start at zero */
unsigned ubo;
unsigned image_dims;
+ unsigned kernel_params;
unsigned driver_param;
unsigned tfbo;
unsigned primitive_param;
struct nir_shader *nir;
struct ir3_stream_output_info stream_output;
+ /* per shader stage specific info: */
+ union {
+ /* for compute shaders: */
+ struct {
+ unsigned req_input_mem; /* in dwords */
+ } cs;
+ };
+
struct ir3_shader_variant *variants;
mtx_t variants_lock;
}
}
+/* Emit the kernel params (info->input) into the const file at
+ * const_state->offsets.kernel_params.
+ *
+ * Nothing is emitted when the kernel-params region starts at or beyond
+ * v->constlen.  The upload size is req_input_mem (in dwords) rounded up to
+ * a multiple of four.
+ */
+static inline void
+emit_kernel_params(struct fd_context *ctx, const struct ir3_shader_variant *v,
+                   struct fd_ringbuffer *ring, const struct pipe_grid_info *info)
+   assert_dt
+{
+   const struct ir3_const_state *const_state = ir3_const_state(v);
+   const uint32_t base = const_state->offsets.kernel_params;
+
+   /* region lies entirely past the consts this variant uses: */
+   if (v->constlen <= base)
+      return;
+
+   ring_wfi(ctx->batch, ring);
+
+   /* NOTE(review): base looks to be in vec4 units, hence the *4 to get a
+    * dword offset -- confirm against the const-state layout code.
+    */
+   const uint32_t regid_dw = base * 4;
+   const uint32_t size_dw = align(v->shader->cs.req_input_mem, 4);
+
+   emit_const_user(ring, v, regid_dw, size_dw, info->input);
+}
+
static inline void
ir3_emit_vs_driver_params(const struct ir3_shader_variant *v,
struct fd_ringbuffer *ring, struct fd_context *ctx,
debug_assert(gl_shader_stage_is_compute(v->type));
emit_common_consts(v, ring, ctx, PIPE_SHADER_COMPUTE);
+ emit_kernel_params(ctx, v, ring, info);
/* emit compute-shader driver-params: */
const struct ir3_const_state *const_state = ir3_const_state(v);
}
struct ir3_shader *shader = ir3_shader_from_nir(compiler, nir, 0, NULL);
+ shader->cs.req_input_mem = align(cso->req_input_mem, 4) / 4; /* byte->dword */
+
struct ir3_shader_state *hwcso = calloc(1, sizeof(*hwcso));
util_queue_fence_init(&hwcso->ready);