intel/rt: Add a helper to create the raygen trampoline shader
author: Jason Ekstrand <jason@jlekstrand.net>
Thu, 6 Aug 2020 21:31:05 +0000 (16:31 -0500)
committer: Marge Bot <eric+marge@anholt.net>
Wed, 25 Nov 2020 05:37:10 +0000 (05:37 +0000)
Unlike graphics and compute pipelines, Vulkan ray-tracing pipelines do
not have a single entrypoint.  Instead, the raygen shader is specified
as a one-element shader binding table in the vkCmdTraceRaysKHR call.  This
means that raygen shaders have to be bindless shaders just like any
other ray tracing shader.  To launch them, we have a tiny compute shader
that acts as a trampoline and sets up the hotzone and uses btd_spawn to
fire off the raygen shader.

Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7356>

src/intel/compiler/brw_nir_rt.c
src/intel/compiler/brw_nir_rt.h
src/intel/compiler/brw_rt.h

index ca0c43e..9c7f7be 100644 (file)
@@ -368,3 +368,126 @@ brw_nir_lower_combined_intersection_any_hit(nir_shader *intersection,
    NIR_PASS_V(intersection, lower_ray_walk_intrinsics, devinfo);
    lower_rt_io_and_scratch(intersection);
 }
+
+/* Emit a load_uniform intrinsic at the builder's cursor.
+ *
+ * The load uses a constant zero as its offset source and `offset` as the
+ * intrinsic base; its range is the byte size of the loaded vector
+ * (num_components * bit_size / 8).  Returns the SSA value of the load.
+ */
+static nir_ssa_def *
+build_load_uniform(nir_builder *b, unsigned offset,
+                   unsigned num_components, unsigned bit_size)
+{
+   const unsigned size_B = num_components * bit_size / 8;
+
+   nir_intrinsic_instr *uniform_load =
+      nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_uniform);
+   nir_ssa_def *zero_offset = nir_imm_int(b, 0);
+
+   uniform_load->num_components = num_components;
+   uniform_load->src[0] = nir_src_for_ssa(zero_offset);
+   nir_intrinsic_set_base(uniform_load, offset);
+   nir_intrinsic_set_range(uniform_load, size_B);
+
+   nir_ssa_dest_init(&uniform_load->instr, &uniform_load->dest,
+                     num_components, bit_size, NULL);
+   nir_builder_instr_insert(b, &uniform_load->instr);
+
+   nir_ssa_def *result = &uniform_load->dest.ssa;
+   return result;
+}
+
+/* Load one field of struct brw_rt_raygen_trampoline_params as a uniform.
+ *
+ * `name` is the struct member; its offsetof() becomes the uniform offset
+ * handed to build_load_uniform().  The caller supplies num_components and
+ * bit_size, which are not checked against the member's declared type here.
+ */
+#define load_trampoline_param(b, name, num_components, bit_size) \
+   build_load_uniform((b), offsetof(struct brw_rt_raygen_trampoline_params, name), \
+                      (num_components), (bit_size))
+
+/** Build the compute shader that launches a raygen shader
+ *
+ * The shader computes a 3D launch ID for each invocation from the work
+ * group ID and the subgroup invocation index.  Invocations whose launch ID
+ * is inside the launch size write the SW hotzone and BTD-spawn the shader
+ * at raygen_bsr_addr; all other invocations issue a BTD retire.
+ *
+ * compiler supplies the device info and the compute-stage NIR options.
+ * The new shader is ralloc_steal()ed into mem_ctx and returned after
+ * lowering and optimization.
+ */
+nir_shader *
+brw_nir_create_raygen_trampoline(const struct brw_compiler *compiler,
+                                 void *mem_ctx)
+{
+   const struct gen_device_info *devinfo = compiler->devinfo;
+   const nir_shader_compiler_options *nir_options =
+      compiler->glsl_compiler_options[MESA_SHADER_COMPUTE].NirOptions;
+
+   /* num_uniforms below is hard-coded to this same 32B size. */
+   STATIC_ASSERT(sizeof(struct brw_rt_raygen_trampoline_params) == 32);
+
+   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
+                                                  nir_options,
+                                                  "RT Ray-Gen Trampoline");
+   ralloc_steal(mem_ctx, b.shader);
+
+   /* The shape of the local workgroup is not known here; it arrives at
+    * run time through the local_group_size_log2 parameter loaded below.
+    */
+   b.shader->info.cs.local_size_variable = true;
+
+   /* The RT global data and raygen BINDLESS_SHADER_RECORD addresses are
+    * passed in as push constants in the first register.  We deal with the
+    * raygen BSR address here; the global data we'll deal with later.
+    */
+   b.shader->num_uniforms = 32;
+   nir_ssa_def *raygen_bsr_addr =
+      load_trampoline_param(&b, raygen_bsr_addr, 1, 64);
+   /* Three 8-bit log2 sizes (x, y, z), widened to 32 bits. */
+   nir_ssa_def *local_shift =
+      nir_u2u32(&b, load_trampoline_param(&b, local_group_size_log2, 3, 8));
+
+   /* The subgroup invocation index is unpacked as a local ID: X is the low
+    * local_shift.x bits, Y is the next local_shift.y bits and Z is the
+    * local_shift.z bits after that.
+    */
+   nir_ssa_def *global_id = nir_load_work_group_id(&b, 32);
+   nir_ssa_def *simd_channel = nir_load_subgroup_invocation(&b);
+   nir_ssa_def *local_x =
+      nir_ubfe(&b, simd_channel, nir_imm_int(&b, 0),
+                  nir_channel(&b, local_shift, 0));
+   nir_ssa_def *local_y =
+      nir_ubfe(&b, simd_channel, nir_channel(&b, local_shift, 0),
+                  nir_channel(&b, local_shift, 1));
+   nir_ssa_def *local_z =
+      nir_ubfe(&b, simd_channel,
+                  nir_iadd(&b, nir_channel(&b, local_shift, 0),
+                              nir_channel(&b, local_shift, 1)),
+                  nir_channel(&b, local_shift, 2));
+   /* launch_id = (work_group_id << local_shift) + local_id, per component */
+   nir_ssa_def *launch_id =
+      nir_iadd(&b, nir_ishl(&b, global_id, local_shift),
+                  nir_vec3(&b, local_x, local_y, local_z));
+
+   /* Only invocations whose launch ID is below the launch size in all
+    * three components go on to spawn the raygen shader.
+    */
+   nir_ssa_def *launch_size = nir_load_ray_launch_size(&b);
+   nir_push_if(&b, nir_ball(&b, nir_ult(&b, launch_id, launch_size)));
+   {
+      nir_store_global(&b, brw_nir_rt_sw_hotzone_addr(&b, devinfo), 16,
+                       nir_vec4(&b, nir_imm_int(&b, 0), /* Stack ptr */
+                                    nir_channel(&b, launch_id, 0),
+                                    nir_channel(&b, launch_id, 1),
+                                    nir_channel(&b, launch_id, 2)),
+                       0xf /* write mask */);
+
+      brw_nir_btd_spawn(&b, raygen_bsr_addr);
+   }
+   nir_push_else(&b, NULL);
+   {
+      /* Even though these invocations aren't being used for anything, the
+       * hardware allocated stack IDs for them.  They need to retire them.
+       */
+      brw_nir_btd_retire(&b);
+   }
+   nir_pop_if(&b, NULL);
+
+   nir_shader *nir = b.shader;
+   /* NOTE(review): a different name string was already passed to
+    * nir_builder_init_simple_shader above; this assignment replaces
+    * whatever that call stored -- confirm which name is intended.
+    */
+   nir->info.name = ralloc_strdup(nir, "RT: TraceRay trampoline");
+   nir_validate_shader(nir, "in brw_nir_create_raygen_trampoline");
+   brw_preprocess_nir(compiler, nir, NULL);
+
+   NIR_PASS_V(nir, brw_nir_lower_rt_intrinsics, devinfo);
+
+   /* brw_nir_lower_rt_intrinsics will leave us with a btd_global_arg_addr
+    * intrinsic which doesn't exist in compute shaders.  We also created one
+    * above when we generated the BTD spawn intrinsic.  Now we go through and
+    * replace them with a uniform load.
+    */
+   nir_foreach_block(block, b.impl) {
+      /* The _safe iterator is used because the matched instruction is
+       * removed inside the loop body.
+       */
+      nir_foreach_instr_safe(instr, block) {
+         if (instr->type != nir_instr_type_intrinsic)
+            continue;
+
+         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+         if (intrin->intrinsic != nir_intrinsic_load_btd_global_arg_addr_intel)
+            continue;
+
+         /* Emit the replacement load right before the intrinsic it
+          * replaces, then redirect all uses to it and drop the original.
+          */
+         b.cursor = nir_before_instr(&intrin->instr);
+         nir_ssa_def *global_arg_addr =
+            load_trampoline_param(&b, rt_disp_globals_addr, 1, 64);
+         assert(intrin->dest.is_ssa);
+         nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
+                                  nir_src_for_ssa(global_arg_addr));
+         nir_instr_remove(instr);
+      }
+   }
+
+   NIR_PASS_V(nir, brw_nir_lower_cs_intrinsics);
+
+   brw_nir_optimize(nir, compiler, true, false);
+
+   return nir;
+}
index a16f6be..baff0e6 100644 (file)
@@ -64,6 +64,9 @@ void brw_nir_lower_intersection_shader(nir_shader *intersection,
                                        const struct gen_device_info *devinfo);
 
 nir_shader *
+brw_nir_create_raygen_trampoline(const struct brw_compiler *compiler,
+                                 void *mem_ctx);
+nir_shader *
 brw_nir_create_trivial_return_shader(const struct brw_compiler *compiler,
                                      void *mem_ctx);
 
index 330abf4..eebb29b 100644 (file)
@@ -96,6 +96,39 @@ struct brw_rt_scratch_layout {
    uint64_t total_size;
 };
 
+/** Parameters passed to the raygen trampoline shader
+ *
+ * This struct is carefully constructed to be 32B and must be passed to the
+ * raygen trampoline shader as inline constant data.
+ */
+struct brw_rt_raygen_trampoline_params {
+   /** The GPU address of the RT_DISPATCH_GLOBALS */
+   uint64_t rt_disp_globals_addr;
+
+   /** The GPU address of the BINDLESS_SHADER_RECORD for the raygen shader */
+   uint64_t raygen_bsr_addr;
+
+   /** 1 if this is an indirect dispatch, 0 otherwise */
+   uint8_t is_indirect;
+
+   /** The integer log2 of the local group size
+    *
+    * Ray-tracing shaders don't have a concept of local vs. global workgroup
+    * size.  They only have a single 3D launch size.  The raygen trampoline
+    * shader is always dispatched with a local workgroup size equal to the
+    * SIMD width but the shape of the local workgroup is determined at
+    * dispatch time based on the shape of the launch and passed to the
+    * trampoline via this field.  (There's no sense having a Z dimension on
+    * the local workgroup if the launch is 2D.)
+    *
+    * We use the integer log2 of the size because there's no point in
+    * non-power-of-two sizes and shifts are cheaper than division.
+    */
+   uint8_t local_group_size_log2[3];
+
+   /** Padding that brings the struct up to its required 32B size */
+   uint32_t pad[3];
+};
+
 /** Size of the "hot zone" in bytes
  *
  * The hot zone is a SW-defined data structure which is a single uvec4