From 10c17f23b752c54f5388e0f40e3d534477c8e500 Mon Sep 17 00:00:00 2001
From: Rob Clark
Date: Thu, 4 May 2017 13:24:37 -0400
Subject: [PATCH] freedreno: core compute state support

Signed-off-by: Rob Clark
---
Review notes (text between "---" and the diffstat is not part of the
commit message):
 - fd_launch_grid(): SSBOs bound to the compute stage are tracked with
   resource_written(), matching the "assume the worst" comment.
   NOTE(review): resource_written() is assumed to be the existing
   write-side counterpart of resource_read() in freedreno_draw.c;
   confirm before applying.
 - fd_get_compute_param(): PIPE_COMPUTE_CAP_IR_TARGET uses a literal
   "%s" format and counts the terminating NUL in the reported size.
 - Only added ('+') lines were amended; hunk line counts are unchanged.

 src/gallium/drivers/freedreno/freedreno_context.h |  14 ++-
 src/gallium/drivers/freedreno/freedreno_draw.c    |  47 +++++++++
 src/gallium/drivers/freedreno/freedreno_gmem.c    |   7 ++
 src/gallium/drivers/freedreno/freedreno_gmem.h    |   1 +
 src/gallium/drivers/freedreno/freedreno_screen.c  | 115 +++++++++++++++++++++-
 src/gallium/drivers/freedreno/freedreno_screen.h  |   6 ++
 src/gallium/drivers/freedreno/freedreno_state.c   |  32 ++++++
 7 files changed, 216 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/freedreno/freedreno_context.h b/src/gallium/drivers/freedreno/freedreno_context.h
index 7e940bc..698af4e 100644
--- a/src/gallium/drivers/freedreno/freedreno_context.h
+++ b/src/gallium/drivers/freedreno/freedreno_context.h
@@ -255,6 +255,7 @@ struct fd_context {
 	/* per shader-stage dirty status: */
 	enum fd_dirty_shader_state dirty_shader[PIPE_SHADER_TYPES];
 
+	void *compute;
 	struct pipe_blend_state *blend;
 	struct pipe_rasterizer_state *rasterizer;
 	struct pipe_depth_stencil_alpha_state *zsa;
@@ -299,6 +300,9 @@ struct fd_context {
 	void (*clear)(struct fd_context *ctx, unsigned buffers,
 			const union pipe_color_union *color, double depth, unsigned stencil);
 
+	/* compute: */
+	void (*launch_grid)(struct fd_context *ctx, const struct pipe_grid_info *info);
+
 	/* constant emit: (note currently not used/needed for a2xx) */
 	void (*emit_const)(struct fd_ringbuffer *ring, enum shader_t type,
 			uint32_t regid, uint32_t offset, uint32_t sizedwords,
@@ -376,8 +380,16 @@ static inline void
 fd_context_all_clean(struct fd_context *ctx)
 {
 	ctx->dirty = 0;
-	for (unsigned i = 0; i < PIPE_SHADER_TYPES; i++)
+	for (unsigned i = 0; i < PIPE_SHADER_TYPES; i++) {
+		/* don't mark compute state as clean, since it is not emitted
+		 * during normal draw call. The places that call _all_dirty(),
+		 * it is safe to mark compute state dirty as well, but the
+		 * inverse is not true.
+		 */
+		if (i == PIPE_SHADER_COMPUTE)
+			continue;
 		ctx->dirty_shader[i] = 0;
+	}
 }
 
 static inline struct pipe_scissor_state *
diff --git a/src/gallium/drivers/freedreno/freedreno_draw.c b/src/gallium/drivers/freedreno/freedreno_draw.c
index 6691f65..4e7827d 100644
--- a/src/gallium/drivers/freedreno/freedreno_draw.c
+++ b/src/gallium/drivers/freedreno/freedreno_draw.c
@@ -408,6 +408,49 @@ fd_clear_depth_stencil(struct pipe_context *pctx, struct pipe_surface *ps,
 			buffers, depth, stencil, x, y, w, h);
 }
 
+static void
+fd_launch_grid(struct pipe_context *pctx, const struct pipe_grid_info *info)
+{
+	struct fd_context *ctx = fd_context(pctx);
+	struct fd_batch *batch, *save_batch = NULL;
+	unsigned i;
+
+	/* TODO maybe we don't want to allocate and flush a batch each time?
+	 * We could use a special bogus (ie. won't match any fb state) key
+	 * in the batch-case for compute shaders, and rely on the rest of
+	 * the dependency tracking mechanism to tell us when the compute
+	 * batch needs to be flushed?
+	 */
+	batch = fd_bc_alloc_batch(&ctx->screen->batch_cache, ctx);
+	fd_batch_reference(&save_batch, ctx->batch);
+	fd_batch_reference(&ctx->batch, batch);
+
+	mtx_lock(&ctx->screen->lock);
+
+	/* Mark SSBOs as being written.. we don't actually know which ones are
+	 * read vs written, so just assume the worst
+	 */
+	foreach_bit(i, ctx->shaderbuf[PIPE_SHADER_COMPUTE].enabled_mask)
+		resource_written(batch, ctx->shaderbuf[PIPE_SHADER_COMPUTE].sb[i].buffer);
+
+	/* UBO's are read */
+	foreach_bit(i, ctx->constbuf[PIPE_SHADER_COMPUTE].enabled_mask)
+		resource_read(batch, ctx->constbuf[PIPE_SHADER_COMPUTE].cb[i].buffer);
+
+	/* Mark textures as being read */
+	foreach_bit(i, ctx->tex[PIPE_SHADER_COMPUTE].valid_textures)
+		resource_read(batch, ctx->tex[PIPE_SHADER_COMPUTE].textures[i]->texture);
+
+	mtx_unlock(&ctx->screen->lock);
+
+	ctx->launch_grid(ctx, info);
+
+	fd_gmem_flush_compute(batch);
+
+	fd_batch_reference(&ctx->batch, save_batch);
+	fd_batch_reference(&save_batch, NULL);
+}
+
 void
 fd_draw_init(struct pipe_context *pctx)
 {
@@ -415,4 +458,8 @@ fd_draw_init(struct pipe_context *pctx)
 	pctx->clear = fd_clear;
 	pctx->clear_render_target = fd_clear_render_target;
 	pctx->clear_depth_stencil = fd_clear_depth_stencil;
+
+	if (has_compute(fd_screen(pctx->screen))) {
+		pctx->launch_grid = fd_launch_grid;
+	}
 }
diff --git a/src/gallium/drivers/freedreno/freedreno_gmem.c b/src/gallium/drivers/freedreno/freedreno_gmem.c
index ded2321..23be047 100644
--- a/src/gallium/drivers/freedreno/freedreno_gmem.c
+++ b/src/gallium/drivers/freedreno/freedreno_gmem.c
@@ -439,6 +439,13 @@ fd_gmem_render_noop(struct fd_batch *batch)
 	flush_ring(batch);
 }
 
+void
+fd_gmem_flush_compute(struct fd_batch *batch)
+{
+	render_sysmem(batch);
+	flush_ring(batch);
+}
+
 /* tile needs restore if it isn't completely contained within the
  * cleared scissor:
  */
diff --git a/src/gallium/drivers/freedreno/freedreno_gmem.h b/src/gallium/drivers/freedreno/freedreno_gmem.h
index 6598ea9..42a8dfa 100644
--- a/src/gallium/drivers/freedreno/freedreno_gmem.h
+++ b/src/gallium/drivers/freedreno/freedreno_gmem.h
@@ -63,6 +63,7 @@ struct fd_batch;
 
 void fd_gmem_render_tiles(struct fd_batch *batch);
 void fd_gmem_render_noop(struct fd_batch *batch);
+void fd_gmem_flush_compute(struct fd_batch *batch);
 bool fd_gmem_needs_restore(struct fd_batch *batch, struct fd_tile *tile,
 		uint32_t buffers);
 
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index 15293b1..052565d 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -189,13 +189,15 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_USER_CONSTANT_BUFFERS:
 		return is_a4xx(screen) ? 0 : 1;
 
+	case PIPE_CAP_COMPUTE:
+		return has_compute(screen);
+
 	case PIPE_CAP_SHADER_STENCIL_EXPORT:
 	case PIPE_CAP_TGSI_TEXCOORD:
 	case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
 	case PIPE_CAP_TEXTURE_MULTISAMPLE:
 	case PIPE_CAP_TEXTURE_BARRIER:
 	case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
-	case PIPE_CAP_COMPUTE:
 	case PIPE_CAP_QUERY_MEMORY_INFO:
 	case PIPE_CAP_PCI_GROUP:
 	case PIPE_CAP_PCI_BUS:
@@ -454,6 +456,9 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen,
 	case PIPE_SHADER_VERTEX:
 		break;
 	case PIPE_SHADER_COMPUTE:
+		if (has_compute(screen))
+			break;
+		return 0;
 	case PIPE_SHADER_GEOMETRY:
 		/* maye we could emulate.. */
 		return 0;
@@ -514,13 +519,30 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen,
 	case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS:
 		return 16;
 	case PIPE_SHADER_CAP_PREFERRED_IR:
-		if ((fd_mesa_debug & FD_DBG_NIR) && is_ir3(screen))
+		switch (shader) {
+		case PIPE_SHADER_FRAGMENT:
+		case PIPE_SHADER_VERTEX:
+			if ((fd_mesa_debug & FD_DBG_NIR) && is_ir3(screen))
+				return PIPE_SHADER_IR_NIR;
+			return PIPE_SHADER_IR_TGSI;
+		default:
+			/* tgsi_to_nir doesn't really support much beyond FS/VS: */
+			debug_assert(is_ir3(screen));
 			return PIPE_SHADER_IR_NIR;
-		return PIPE_SHADER_IR_TGSI;
+		}
+		break;
 	case PIPE_SHADER_CAP_SUPPORTED_IRS:
+		if (is_ir3(screen)) {
+			return (1 << PIPE_SHADER_IR_NIR) | (1 << PIPE_SHADER_IR_TGSI);
+		} else {
+			return (1 << PIPE_SHADER_IR_TGSI);
+		}
 		return 0;
 	case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
 		return 32;
+	case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
+	case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
+		return 0;
 	case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
 		if (is_a5xx(screen)) {
 			/* a5xx (and a4xx for that matter) has one state-block
@@ -552,14 +574,96 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen,
 		}
 		return 0;
 	case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
-	case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
-	case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
+		/* probably should be same as MAX_SHADER_BUFFERS but not implemented yet */
 		return 0;
 	}
 	debug_printf("unknown shader param %d\n", param);
 	return 0;
 }
 
+/* TODO depending on how much the limits differ for a3xx/a4xx, maybe move this
+ * into per-generation backend?
+ */
+static int
+fd_get_compute_param(struct pipe_screen *pscreen, enum pipe_shader_ir ir_type,
+		enum pipe_compute_cap param, void *ret)
+{
+	struct fd_screen *screen = fd_screen(pscreen);
+	const char *ir = "ir3";
+
+	if (!has_compute(screen))
+		return 0;
+
+	switch (param) {
+	case PIPE_COMPUTE_CAP_ADDRESS_BITS:
+		if (ret) {
+			uint32_t *address_bits = ret;
+			address_bits[0] = 32;
+
+			if (is_a5xx(screen))
+				address_bits[0] = 64;
+		}
+		return 1 * sizeof(uint32_t);
+
+	case PIPE_COMPUTE_CAP_IR_TARGET:
+		if (ret)
+			sprintf(ret, "%s", ir);
+		return (strlen(ir) + 1) * sizeof(char); /* +1: NUL written by sprintf() */
+
+	case PIPE_COMPUTE_CAP_GRID_DIMENSION:
+		if (ret) {
+			uint64_t *grid_dimension = ret;
+			grid_dimension[0] = 3;
+		}
+		return 1 * sizeof(uint64_t);
+
+	case PIPE_COMPUTE_CAP_MAX_GRID_SIZE:
+		if (ret) {
+			uint64_t *grid_size = ret;
+			grid_size[0] = 65535;
+			grid_size[1] = 65535;
+			grid_size[2] = 65535;
+		}
+		return 3 * sizeof(uint64_t);
+
+	case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE:
+		if (ret) {
+			uint64_t *block_size = ret;
+			block_size[0] = 1024;
+			block_size[1] = 1024;
+			block_size[2] = 64;
+		}
+		return 3 * sizeof(uint64_t);
+
+	case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK:
+		if (ret) {
+			uint64_t *max_threads_per_block = ret;
+			*max_threads_per_block = 1024;
+		}
+		return sizeof(uint64_t);
+
+	case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE:
+	case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE:
+	case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE:
+	case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE:
+		break;
+	case PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE:
+		if (ret) {
+			uint64_t *max = ret;
+			*max = 32768;
+		}
+		return sizeof(uint64_t);
+	case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY:
+	case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS:
+	case PIPE_COMPUTE_CAP_IMAGES_SUPPORTED:
+	case PIPE_COMPUTE_CAP_SUBGROUP_SIZE:
+	case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK:
+		break;
+	}
+
+	return 0;
+}
+
 static const void *
 fd_get_compiler_options(struct pipe_screen *pscreen,
 		enum pipe_shader_ir ir, unsigned shader)
@@ -752,6 +856,7 @@ fd_screen_create(struct fd_device *dev)
 	pscreen->get_param = fd_screen_get_param;
 	pscreen->get_paramf = fd_screen_get_paramf;
 	pscreen->get_shader_param = fd_screen_get_shader_param;
+	pscreen->get_compute_param = fd_get_compute_param;
 	pscreen->get_compiler_options = fd_get_compiler_options;
 
 	fd_resource_screen_init(pscreen);
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.h b/src/gallium/drivers/freedreno/freedreno_screen.h
index dac7224..83c0449 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.h
+++ b/src/gallium/drivers/freedreno/freedreno_screen.h
@@ -128,4 +128,10 @@ is_ir3(struct fd_screen *screen)
 	return is_a3xx(screen) || is_a4xx(screen) || is_a5xx(screen);
 }
 
+static inline bool
+has_compute(struct fd_screen *screen)
+{
+	return false;
+}
+
 #endif /* FREEDRENO_SCREEN_H_ */
diff --git a/src/gallium/drivers/freedreno/freedreno_state.c b/src/gallium/drivers/freedreno/freedreno_state.c
index bc9fe4a..75bf1b1 100644
--- a/src/gallium/drivers/freedreno/freedreno_state.c
+++ b/src/gallium/drivers/freedreno/freedreno_state.c
@@ -451,6 +451,32 @@ fd_set_stream_output_targets(struct pipe_context *pctx,
 	ctx->dirty |= FD_DIRTY_STREAMOUT;
 }
 
+static void
+fd_bind_compute_state(struct pipe_context *pctx, void *state)
+{
+	struct fd_context *ctx = fd_context(pctx);
+	ctx->compute = state;
+	ctx->dirty_shader[PIPE_SHADER_COMPUTE] |= FD_DIRTY_SHADER_PROG;
+}
+
+static void
+fd_set_compute_resources(struct pipe_context *pctx,
+		unsigned start, unsigned count, struct pipe_surface **prscs)
+{
+	// TODO
+}
+
+static void
+fd_set_global_binding(struct pipe_context *pctx,
+		unsigned first, unsigned count, struct pipe_resource **prscs,
+		uint32_t **handles)
+{
+	/* TODO only used by clover.. seems to need us to return the actual
+	 * gpuaddr of the buffer.. which isn't really exposed to mesa atm.
+	 * How is this used?
+	 */
+}
+
 void
 fd_state_init(struct pipe_context *pctx)
 {
@@ -484,4 +510,10 @@ fd_state_init(struct pipe_context *pctx)
 	pctx->create_stream_output_target = fd_create_stream_output_target;
 	pctx->stream_output_target_destroy = fd_stream_output_target_destroy;
 	pctx->set_stream_output_targets = fd_set_stream_output_targets;
+
+	if (has_compute(fd_screen(pctx->screen))) {
+		pctx->bind_compute_state = fd_bind_compute_state;
+		pctx->set_compute_resources = fd_set_compute_resources;
+		pctx->set_global_binding = fd_set_global_binding;
+	}
 }
-- 
2.7.4