From 61663859bc9b41177ac399e0fcd411019e733f70 Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig
Date: Sat, 25 Feb 2023 11:55:24 -0500
Subject: [PATCH] asahi: Wire up compute kernels

Now that we have multiple sysval tables, implementing compute kernels
-- including indirect dispatch and load_num_workgroups -- is
straightforward. This patch adds the corresponding launch_grid
implementation.

As usual, this needs the UAPI support patches to actually do anything,
but the relevant compute tests pass downstream.

It's not possible to properly test compute shader support right now
(pending support for images), so we don't update the CAPs or
features.txt here. This is more about flushing out the piles of
downstream patches we have (and getting them reviewed!) in preparation
for cutting a downstream release soon.

Signed-off-by: Alyssa Rosenzweig
Part-of:
---
 src/gallium/drivers/asahi/agx_nir_lower_sysvals.c |   2 +
 src/gallium/drivers/asahi/agx_state.c             | 114 +++++++++++++++++++++-
 src/gallium/drivers/asahi/agx_state.h             |  17 +++-
 3 files changed, 127 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c b/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c
index 1d6ff1b..b57d9e1 100644
--- a/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c
+++ b/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c
@@ -109,6 +109,8 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *intr)
    case nir_intrinsic_get_ssbo_size:
       return load_sysval_indirect(b, 1, 32, AGX_SYSVAL_TABLE_ROOT,
                                   &u->ssbo_size, intr->src[0].ssa);
+   case nir_intrinsic_load_num_workgroups:
+      return load_sysval(b, 3, 32, AGX_SYSVAL_TABLE_GRID, 0);
    default:
       return NULL;
    }
diff --git a/src/gallium/drivers/asahi/agx_state.c b/src/gallium/drivers/asahi/agx_state.c
index bb5b2c0..b6a055a 100644
--- a/src/gallium/drivers/asahi/agx_state.c
+++ b/src/gallium/drivers/asahi/agx_state.c
@@ -1602,6 +1602,8 @@ agx_create_compute_state(struct pipe_context *pctx,
    if (!so)
       return NULL;
 
+   so->static_shared_mem = cso->static_shared_mem;
+
    so->variants = _mesa_hash_table_create(NULL, asahi_cs_shader_key_hash,
                                           asahi_cs_shader_key_equal);
 
@@ -1745,7 +1747,7 @@ agx_delete_shader_state(struct pipe_context *ctx, void *cso)
 
 static uint32_t
 agx_build_pipeline(struct agx_batch *batch, struct agx_compiled_shader *cs,
-                   enum pipe_shader_type stage)
+                   enum pipe_shader_type stage, unsigned variable_shared_mem)
 {
    struct agx_context *ctx = batch->ctx;
    unsigned nr_textures = ctx->stage[stage].texture_count;
@@ -1840,6 +1842,7 @@ agx_build_pipeline(struct agx_batch *batch, struct agx_compiled_shader *cs,
     */
    uint64_t uniform_tables[AGX_NUM_SYSVAL_TABLES] = {
       agx_upload_uniforms(batch, T_tex.gpu, stage),
+      ctx->grid_info,
    };
 
    for (unsigned i = 0; i < cs->push_range_count; ++i) {
@@ -1847,10 +1850,21 @@
                       uniform_tables[cs->push[i].table] + cs->push[i].offset);
    }
 
-   if (stage == PIPE_SHADER_FRAGMENT)
+   if (stage == PIPE_SHADER_FRAGMENT) {
       agx_usc_tilebuffer(&b, &batch->tilebuffer_layout);
-   else
+   } else if (stage == PIPE_SHADER_COMPUTE) {
+      unsigned size =
+         ctx->stage[PIPE_SHADER_COMPUTE].shader->static_shared_mem +
+         variable_shared_mem;
+
+      agx_usc_pack(&b, SHARED, cfg) {
+         cfg.layout = AGX_SHARED_LAYOUT_VERTEX_COMPUTE;
+         cfg.bytes_per_threadgroup = size > 0 ? size : 65536;
+         cfg.uses_shared_memory = size > 0;
+      }
+   } else {
       agx_usc_shared_none(&b);
+   }
 
    agx_usc_pack(&b, SHADER, cfg) {
       cfg.loads_varyings = (stage == PIPE_SHADER_FRAGMENT);
@@ -2137,7 +2151,8 @@ agx_encode_state(struct agx_batch *batch, uint8_t *out, bool is_lines,
    out += AGX_VDM_STATE_VERTEX_SHADER_WORD_0_LENGTH;
 
    agx_pack(out, VDM_STATE_VERTEX_SHADER_WORD_1, cfg) {
-      cfg.pipeline = agx_build_pipeline(batch, ctx->vs, PIPE_SHADER_VERTEX);
+      cfg.pipeline =
+         agx_build_pipeline(batch, ctx->vs, PIPE_SHADER_VERTEX, 0);
    }
    out += AGX_VDM_STATE_VERTEX_SHADER_WORD_1_LENGTH;
 
@@ -2310,7 +2325,7 @@ agx_encode_state(struct agx_batch *batch, uint8_t *out, bool is_lines,
 
       agx_ppp_push(&ppp, FRAGMENT_SHADER, cfg) {
          cfg.pipeline =
-            agx_build_pipeline(batch, ctx->fs, PIPE_SHADER_FRAGMENT),
+            agx_build_pipeline(batch, ctx->fs, PIPE_SHADER_FRAGMENT, 0),
          cfg.uniform_register_count = ctx->fs->info.push_count;
          cfg.preshader_register_count = ctx->fs->info.nr_preamble_gprs;
          cfg.texture_state_register_count = frag_tex_count;
@@ -2662,6 +2677,94 @@ agx_texture_barrier(struct pipe_context *pipe, unsigned flags)
    agx_flush_all(ctx, "Texture barrier");
 }
 
+static void
+agx_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
+{
+   struct agx_context *ctx = agx_context(pipe);
+   struct agx_batch *batch = agx_get_compute_batch(ctx);
+
+   /* To implement load_num_workgroups, the number of workgroups needs to be
+    * available in GPU memory. This is either the indirect buffer, or just a
+    * buffer we upload ourselves if not indirect.
+    */
+   if (info->indirect) {
+      struct agx_resource *indirect = agx_resource(info->indirect);
+      agx_batch_reads(batch, indirect);
+
+      ctx->grid_info = indirect->bo->ptr.gpu + info->indirect_offset;
+   } else {
+      static_assert(sizeof(info->grid) == 12,
+                    "matches indirect dispatch buffer");
+
+      ctx->grid_info = agx_pool_upload_aligned(&batch->pool, info->grid,
+                                               sizeof(info->grid), 4);
+   }
+
+   struct agx_uncompiled_shader *uncompiled =
+      ctx->stage[PIPE_SHADER_COMPUTE].shader;
+
+   /* There is exactly one variant, get it */
+   struct agx_compiled_shader *cs =
+      _mesa_hash_table_next_entry(uncompiled->variants, NULL)->data;
+
+   agx_batch_add_bo(batch, cs->bo);
+
+   /* TODO: Ensure space if we allow multiple kernels in a batch */
+   uint8_t *out = batch->encoder_current;
+
+   unsigned nr_textures = ctx->stage[PIPE_SHADER_COMPUTE].texture_count;
+   agx_pack(out, CDM_HEADER, cfg) {
+      if (info->indirect)
+         cfg.mode = AGX_CDM_MODE_INDIRECT_GLOBAL;
+      else
+         cfg.mode = AGX_CDM_MODE_DIRECT;
+
+      cfg.uniform_register_count = cs->info.push_count;
+      cfg.preshader_register_count = cs->info.nr_preamble_gprs;
+      cfg.texture_state_register_count = nr_textures;
+      cfg.sampler_state_register_count = agx_translate_sampler_state_count(
+         nr_textures, ctx->stage[PIPE_SHADER_COMPUTE].custom_borders);
+      cfg.pipeline = agx_build_pipeline(batch, cs, PIPE_SHADER_COMPUTE,
+                                        info->variable_shared_mem);
+   }
+   out += AGX_CDM_HEADER_LENGTH;
+
+   if (info->indirect) {
+      agx_pack(out, CDM_INDIRECT, cfg) {
+         cfg.address_hi = ctx->grid_info >> 32;
+         cfg.address_lo = ctx->grid_info & BITFIELD64_MASK(32);
+      }
+      out += AGX_CDM_INDIRECT_LENGTH;
+   } else {
+      agx_pack(out, CDM_GLOBAL_SIZE, cfg) {
+         cfg.x = info->grid[0] * info->block[0];
+         cfg.y = info->grid[1] * info->block[1];
+         cfg.z = info->grid[2] * info->block[2];
+      }
+      out += AGX_CDM_GLOBAL_SIZE_LENGTH;
+   }
+
+   agx_pack(out, CDM_LOCAL_SIZE, cfg) {
+      cfg.x = info->block[0];
+      cfg.y = info->block[1];
+      cfg.z = info->block[2];
+   }
+   out += AGX_CDM_LOCAL_SIZE_LENGTH;
+
+   agx_pack(out, CDM_LAUNCH, cfg)
+      ;
+   out += AGX_CDM_LAUNCH_LENGTH;
+
+   batch->encoder_current = out;
+   assert(batch->encoder_current <= batch->encoder_end &&
+          "Failed to reserve sufficient space in encoder");
+   /* TODO: Dirty tracking? */
+
+   /* TODO: Allow multiple kernels in a batch? */
+   agx_flush_batch_for_reason(ctx, batch, "Compute kernel serialization");
+   ctx->grid_info = 0;
+}
+
 void agx_init_state_functions(struct pipe_context *ctx);
 
 void
@@ -2709,6 +2812,7 @@ agx_init_state_functions(struct pipe_context *ctx)
    ctx->sampler_view_destroy = agx_sampler_view_destroy;
    ctx->surface_destroy = agx_surface_destroy;
    ctx->draw_vbo = agx_draw_vbo;
+   ctx->launch_grid = agx_launch_grid;
    ctx->create_stream_output_target = agx_create_stream_output_target;
    ctx->stream_output_target_destroy = agx_stream_output_target_destroy;
    ctx->set_stream_output_targets = agx_set_stream_output_targets;
diff --git a/src/gallium/drivers/asahi/agx_state.h b/src/gallium/drivers/asahi/agx_state.h
index 6443a2a..7ae44d5 100644
--- a/src/gallium/drivers/asahi/agx_state.h
+++ b/src/gallium/drivers/asahi/agx_state.h
@@ -68,7 +68,11 @@ agx_so_target(struct pipe_stream_output_target *target)
  * compiler. The layout is up to us and handled by our code lowering system
  * values to uniforms.
  */
-enum agx_sysval_table { AGX_SYSVAL_TABLE_ROOT, AGX_NUM_SYSVAL_TABLES };
+enum agx_sysval_table {
+   AGX_SYSVAL_TABLE_ROOT,
+   AGX_SYSVAL_TABLE_GRID,
+   AGX_NUM_SYSVAL_TABLES
+};
 
 /* Root system value table */
 struct PACKED agx_draw_uniforms {
@@ -137,6 +141,9 @@ struct agx_uncompiled_shader {
    uint8_t nir_sha1[20];
    struct hash_table *variants;
 
+   /* For compute kernels */
+   unsigned static_shared_mem;
+
    /* Set on VS, passed to FS for linkage */
    unsigned base_varying;
 };
@@ -313,6 +320,14 @@ struct agx_context {
    uint16_t sample_mask;
    struct pipe_framebuffer_state framebuffer;
 
+   /* During a launch_grid call, a GPU pointer to
+    *
+    *    uint32_t num_workgroups[3];
+    *
+    * When indirect dispatch is used, that's just the indirect dispatch buffer.
+    */
+   uint64_t grid_info;
+
    struct pipe_query *cond_query;
    bool cond_cond;
    enum pipe_render_cond_flag cond_mode;
-- 
2.7.4
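
Review note (illustration, not part of the patch): the data contract
behind load_num_workgroups, sketched as a standalone C11 program. The
grid_table struct and the program itself are hypothetical; the point is
that the grid counts are three plain uint32_t values, so a direct
dispatch can upload pipe_grid_info::grid verbatim while an indirect
dispatch can alias the application's buffer -- the static_assert in
agx_launch_grid guards exactly this equivalence.

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Hypothetical mirror of the GRID sysval table: three workgroup
     * counts, matching the layout of an indirect dispatch buffer.
     */
    struct grid_table {
       uint32_t num_workgroups[3];
    };

    int main(void)
    {
       /* Direct dispatch: counts come from pipe_grid_info::grid... */
       uint32_t grid[3] = {16, 8, 1};

       /* ...and the 12-byte blob is uploaded as-is. */
       struct grid_table table;
       static_assert(sizeof(grid) == sizeof(table),
                     "matches indirect dispatch buffer");
       memcpy(&table, grid, sizeof(grid));

       /* This is what the lowered load_num_workgroups reads back. */
       printf("num_workgroups = %u x %u x %u\n", table.num_workgroups[0],
              table.num_workgroups[1], table.num_workgroups[2]);
       return 0;
    }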
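
Review note (illustration, not part of the patch): the compute
shared-memory sizing from agx_build_pipeline, restated as a standalone
sketch. struct shared_cfg and compute_shared_cfg are hypothetical
names; the logic mirrors the patch: the total is the shader's static
allocation plus the launch-time variable allocation, and a zero total
still programs a nonzero bytes_per_threadgroup (the patch uses 65536)
while clearing uses_shared_memory.

    #include <stdbool.h>
    #include <stdint.h>

    /* Hypothetical stand-in for the SHARED USC word fields the patch
     * sets for compute shaders.
     */
    struct shared_cfg {
       uint32_t bytes_per_threadgroup;
       bool uses_shared_memory;
    };

    static struct shared_cfg
    compute_shared_cfg(unsigned static_shared_mem,
                       unsigned variable_shared_mem)
    {
       /* Static size comes from the shader, variable size from the
        * launch.
        */
       unsigned size = static_shared_mem + variable_shared_mem;

       return (struct shared_cfg){
          /* The hardware word wants a size even when shared memory
           * is unused.
           */
          .bytes_per_threadgroup = size > 0 ? size : 65536,
          .uses_shared_memory = size > 0,
       };
    }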
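
Review note (illustration, not part of the patch): how a Gallium
frontend would exercise the new hook, assuming the standard Gallium
headers. dispatch_example and its arguments are hypothetical;
launch_grid and pipe_grid_info are the real interfaces the patch
implements.

    #include "pipe/p_context.h"
    #include "pipe/p_state.h"

    static void
    dispatch_example(struct pipe_context *ctx,
                     struct pipe_resource *indirect_buf)
    {
       struct pipe_grid_info info = {
          .block = {64, 1, 1}, /* local size, packed as CDM_LOCAL_SIZE */
          .grid = {16, 8, 1},  /* workgroup counts, uploaded so
                                  load_num_workgroups can read them */
       };

       /* Direct dispatch: the driver packs CDM_GLOBAL_SIZE as
        * grid * block threads per axis.
        */
       ctx->launch_grid(ctx, &info);

       /* Indirect dispatch: the GPU reads three uint32_t workgroup
        * counts from the buffer, and the same address doubles as the
        * GRID sysval table.
        */
       info.indirect = indirect_buf;
       info.indirect_offset = 0;
       ctx->launch_grid(ctx, &info);
    }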