intel/fs: Add support for compiling bindless shaders with resume shaders
author: Jason Ekstrand <jason@jlekstrand.net>
Fri, 4 Sep 2020 17:40:06 +0000 (12:40 -0500)
committer: Marge Bot <eric+marge@anholt.net>
Tue, 22 Jun 2021 21:09:25 +0000 (21:09 +0000)
Instead of depending on the driver to compile each resume shader
separately, we compile them all in one go in the back-end and build an
SBT as part of the shader program.  Shader relocs are used to make the
entries in the SBT point to the correct resume shader.

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8637>

src/intel/compiler/brw_compiler.h
src/intel/compiler/brw_fs.cpp
src/intel/compiler/brw_fs.h
src/intel/compiler/brw_fs_generator.cpp

index b60d09b..f24db8c 100644 (file)
@@ -678,6 +678,7 @@ enum brw_param_builtin {
 enum brw_shader_reloc_id {
    BRW_SHADER_RELOC_CONST_DATA_ADDR_LOW,
    BRW_SHADER_RELOC_CONST_DATA_ADDR_HIGH,
+   BRW_SHADER_RELOC_SHADER_START_OFFSET,
 };
 
 enum brw_shader_reloc_type {
@@ -1062,8 +1063,15 @@ brw_cs_prog_data_prog_offset(const struct brw_cs_prog_data *prog_data,
 
 struct brw_bs_prog_data {
    struct brw_stage_prog_data base;
+
+   /** SIMD size of the root shader */
    uint8_t simd_size;
-   uint32_t stack_size;
+
+   /** Maximum stack size of all shaders */
+   uint32_t max_stack_size;
+
+   /** Offset into the shader where the resume SBT is located */
+   uint32_t resume_sbt_offset;
 };
 
 struct brw_ff_gs_prog_data {
@@ -1675,6 +1683,8 @@ brw_compile_bs(const struct brw_compiler *compiler, void *log_data,
                const struct brw_bs_prog_key *key,
                struct brw_bs_prog_data *prog_data,
                struct nir_shader *shader,
+               unsigned num_resume_shaders,
+               struct nir_shader **resume_shaders,
                struct brw_compile_stats *stats,
                char **error_str);
 
index d83224b..f8bce4e 100644 (file)
@@ -9875,19 +9875,22 @@ brw_cs_get_dispatch_info(const struct intel_device_info *devinfo,
    return info;
 }
 
-const unsigned *
-brw_compile_bs(const struct brw_compiler *compiler, void *log_data,
-               void *mem_ctx,
-               const struct brw_bs_prog_key *key,
-               struct brw_bs_prog_data *prog_data,
-               nir_shader *shader,
-               struct brw_compile_stats *stats,
-               char **error_str)
+static uint8_t
+compile_single_bs(const struct brw_compiler *compiler, void *log_data,
+                  void *mem_ctx,
+                  const struct brw_bs_prog_key *key,
+                  struct brw_bs_prog_data *prog_data,
+                  nir_shader *shader,
+                  fs_generator *g,
+                  struct brw_compile_stats *stats,
+                  int *prog_offset,
+                  char **error_str)
 {
    const bool debug_enabled = INTEL_DEBUG & DEBUG_RT;
 
    prog_data->base.stage = shader->info.stage;
-   prog_data->stack_size = shader->scratch_size;
+   prog_data->max_stack_size = MAX2(prog_data->max_stack_size,
+                                    shader->scratch_size);
 
    const unsigned max_dispatch_width = 16;
    brw_nir_apply_key(shader, compiler, &key->base, max_dispatch_width, true);
@@ -9897,6 +9900,7 @@ brw_compile_bs(const struct brw_compiler *compiler, void *log_data,
    fs_visitor *v = NULL, *v8 = NULL, *v16 = NULL;
    bool has_spilled = false;
 
+   uint8_t simd_size = 0;
    if (likely(!(INTEL_DEBUG & DEBUG_NO8))) {
       v8 = new fs_visitor(compiler, log_data, mem_ctx, &key->base,
                           &prog_data->base, shader,
@@ -9906,10 +9910,10 @@ brw_compile_bs(const struct brw_compiler *compiler, void *log_data,
          if (error_str)
             *error_str = ralloc_strdup(mem_ctx, v8->fail_msg);
          delete v8;
-         return NULL;
+         return 0;
       } else {
          v = v8;
-         prog_data->simd_size = 8;
+         simd_size = 8;
          if (v8->spilled_any_registers)
             has_spilled = true;
       }
@@ -9932,11 +9936,11 @@ brw_compile_bs(const struct brw_compiler *compiler, void *log_data,
                   v16->fail_msg);
             }
             delete v16;
-            return NULL;
+            return 0;
          }
       } else {
          v = v16;
-         prog_data->simd_size = 16;
+         simd_size = 16;
          if (v16->spilled_any_registers)
             has_spilled = true;
       }
@@ -9948,13 +9952,55 @@ brw_compile_bs(const struct brw_compiler *compiler, void *log_data,
          *error_str = ralloc_strdup(mem_ctx,
             "Cannot satisfy INTEL_DEBUG flags SIMD restrictions");
       }
-      return NULL;
+      return false;
    }
 
    assert(v);
 
+   int offset = g->generate_code(v->cfg, simd_size, v->shader_stats,
+                                 v->performance_analysis.require(), stats);
+   if (prog_offset)
+      *prog_offset = offset;
+   else
+      assert(offset == 0);
+
+   delete v8;
+   delete v16;
+
+   return simd_size;
+}
+
+/**
+ * Pack a shader offset, its SIMD size and a local argument offset into a
+ * single 64-bit value (presumably a "bindless shader record" -- the name is
+ * not expanded anywhere in this change; confirm against the hardware docs).
+ *
+ * \param devinfo           Device info; not referenced by the body.
+ * \param offset            Shader offset; must be a multiple of 64.
+ * \param simd_size         Dispatch width of the shader; must be 8 or 16.
+ * \param local_arg_offset  Local argument offset; must be a multiple of 8.
+ *
+ * \return offset OR'd with the SIMD-size flag and the local argument offset
+ *         divided by 8.
+ */
+uint64_t
+brw_bsr(const struct intel_device_info *devinfo,
+        uint32_t offset, uint8_t simd_size, uint8_t local_arg_offset)
+{
+   /* The flag fields below are OR'd into the low bits of the result, so the
+    * offset is required to be 64-aligned, which leaves its low six bits
+    * zero.
+    */
+   assert(offset % 64 == 0);
+   assert(simd_size == 8 || simd_size == 16);
+   assert(local_arg_offset % 8 == 0);
+
+   /* NOTE(review): SET_BITS(value, high, low) is defined elsewhere;
+    * presumably it places value in bits high..low -- confirm.  Under that
+    * reading, bit 4 is set for SIMD16 and bits 2:0 hold
+    * local_arg_offset / 8.
+    */
+   return offset |
+          SET_BITS(simd_size > 8, 4, 4) |
+          SET_BITS(local_arg_offset / 8, 2, 0);
+}
+
+const unsigned *
+brw_compile_bs(const struct brw_compiler *compiler, void *log_data,
+               void *mem_ctx,
+               const struct brw_bs_prog_key *key,
+               struct brw_bs_prog_data *prog_data,
+               nir_shader *shader,
+               unsigned num_resume_shaders,
+               struct nir_shader **resume_shaders,
+               struct brw_compile_stats *stats,
+               char **error_str)
+{
+   const bool debug_enabled = INTEL_DEBUG & DEBUG_RT;
+
+   prog_data->base.stage = shader->info.stage;
+   prog_data->max_stack_size = 0;
+
    fs_generator g(compiler, log_data, mem_ctx, &prog_data->base,
-                  v->runtime_check_aads_emit, shader->info.stage);
+                  false, shader->info.stage);
    if (unlikely(debug_enabled)) {
       char *name = ralloc_asprintf(mem_ctx, "%s %s shader %s",
                                    shader->info.label ?
@@ -9964,13 +10010,48 @@ brw_compile_bs(const struct brw_compiler *compiler, void *log_data,
       g.enable_debug(name);
    }
 
-   g.generate_code(v->cfg, prog_data->simd_size, v->shader_stats,
-                   v->performance_analysis.require(), stats);
+   prog_data->simd_size =
+      compile_single_bs(compiler, log_data, mem_ctx, key, prog_data,
+                        shader, &g, stats, NULL, error_str);
+   if (prog_data->simd_size == 0)
+      return NULL;
 
-   delete v8;
-   delete v16;
+   uint64_t *resume_sbt = ralloc_array(mem_ctx, uint64_t, num_resume_shaders);
+   for (unsigned i = 0; i < num_resume_shaders; i++) {
+      if (INTEL_DEBUG & DEBUG_RT) {
+         char *name = ralloc_asprintf(mem_ctx, "%s %s resume(%u) shader %s",
+                                      shader->info.label ?
+                                         shader->info.label : "unnamed",
+                                      gl_shader_stage_name(shader->info.stage),
+                                      i, shader->info.name);
+         g.enable_debug(name);
+      }
+
+      /* TODO: Figure out shader stats etc. for resume shaders */
+      int offset = 0;
+      uint8_t simd_size =
+         compile_single_bs(compiler, log_data, mem_ctx, key, prog_data,
+                           resume_shaders[i], &g, NULL, &offset, error_str);
+      if (simd_size == 0)
+         return NULL;
+
+      assert(offset > 0);
+      resume_sbt[i] = brw_bsr(compiler->devinfo, offset, simd_size, 0);
+   }
+
+   /* We only have one constant data so we want to make sure they're all the
+    * same.
+    */
+   for (unsigned i = 0; i < num_resume_shaders; i++) {
+      assert(resume_shaders[i]->constant_data_size ==
+             shader->constant_data_size);
+      assert(memcmp(resume_shaders[i]->constant_data,
+                    shader->constant_data,
+                    shader->constant_data_size) == 0);
+   }
 
    g.add_const_data(shader->constant_data, shader->constant_data_size);
+   g.add_resume_sbt(num_resume_shaders, resume_sbt);
 
    return g.get_assembly();
 }
index 57193a1..22cd226 100644 (file)
@@ -478,6 +478,7 @@ public:
                      const brw::performance &perf,
                      struct brw_compile_stats *stats);
    void add_const_data(void *data, unsigned size);
+   void add_resume_sbt(unsigned num_resume_shaders, uint64_t *sbt);
    const unsigned *get_assembly();
 
 private:
index c6e68ad..16240b1 100644 (file)
@@ -2805,6 +2805,24 @@ fs_generator::add_const_data(void *data, unsigned size)
    }
 }
 
+/**
+ * Append the resume SBT to the program and register one relocation per
+ * entry.
+ *
+ * Only valid for bindless shader stages (asserted).  When num_resume_shaders
+ * is zero nothing is appended and resume_sbt_offset is left untouched.
+ *
+ * \param num_resume_shaders  Number of 64-bit entries in \p sbt.
+ * \param sbt                 Array of num_resume_shaders entries to append.
+ */
+void
+fs_generator::add_resume_sbt(unsigned num_resume_shaders, uint64_t *sbt)
+{
+   assert(brw_shader_stage_is_bindless(stage));
+   struct brw_bs_prog_data *bs_prog_data = brw_bs_prog_data(prog_data);
+   if (num_resume_shaders > 0) {
+      /* Record where the table was placed so it can be found later.
+       * NOTE(review): the trailing 32 passed to brw_append_data() is
+       * presumably the required alignment in bytes -- confirm against its
+       * declaration.
+       */
+      bs_prog_data->resume_sbt_offset =
+         brw_append_data(p, sbt, num_resume_shaders * sizeof(uint64_t), 32);
+      for (unsigned i = 0; i < num_resume_shaders; i++) {
+         /* Byte offset of entry i, counted from the stored table offset. */
+         size_t offset = bs_prog_data->resume_sbt_offset + i * sizeof(*sbt);
+         assert(offset <= UINT32_MAX);
+         /* A 32-bit SHADER_START_OFFSET reloc is registered at the entry's
+          * offset; only the low 32 bits of sbt[i] are passed along.
+          * NOTE(review): the last argument is presumably the addend applied
+          * on top of the shader start offset when the reloc is resolved --
+          * confirm against brw_add_reloc().
+          */
+         brw_add_reloc(p, BRW_SHADER_RELOC_SHADER_START_OFFSET,
+                       BRW_SHADER_RELOC_TYPE_U32,
+                       (uint32_t)offset, (uint32_t)sbt[i]);
+      }
+   }
+}
+
 const unsigned *
 fs_generator::get_assembly()
 {