nir/lower_task_shader: fix task payload corruption when shared memory workaround...
author Marcin Ślusarz <marcin.slusarz@intel.com>
Wed, 30 Nov 2022 12:47:19 +0000 (13:47 +0100)
committer Marge Bot <emma+marge@anholt.net>
Tue, 6 Dec 2022 16:31:11 +0000 (16:31 +0000)
We were not taking into account that when all invocations within the
workgroup are active, we'll copy more data than needed, corrupting the
task payload of other workgroups.
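
For illustration only, not part of the committed change: a minimal sketch of
the over-copy produced by the old rounding, assuming a hypothetical 36-byte
payload and a 2-invocation workgroup. The standalone harness and the concrete
numbers are assumptions made for this example; only the sizing arithmetic
mirrors the removed code in the diff below.

#include <assert.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
   const unsigned payload_size = 36;   /* hypothetical user payload, in bytes */
   const unsigned invocations = 2;     /* hypothetical workgroup size */
   const unsigned bytes_per_copy = 16;

   /* Old scheme: round the per-invocation copy count up and let every
    * invocation run the full loop, so the workgroup writes
    * 2 invocations * 2 copies * 16 bytes = 64 bytes for a 36-byte payload,
    * i.e. 28 bytes past the end of this payload.
    */
   unsigned copies_needed = DIV_ROUND_UP(payload_size, bytes_per_copy);       /* 3 */
   unsigned copies_per_invocation = DIV_ROUND_UP(copies_needed, invocations); /* 2 */
   unsigned old_bytes_written = copies_per_invocation * invocations * bytes_per_copy;

   assert(old_bytes_written == 64 && old_bytes_written > payload_size);
   return 0;
}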

Fixes: 8aff8d3dd42 ("nir: Add common task shader lowering to make the backend's job easier.")
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20080>

src/compiler/nir/nir_lower_task_shader.c

index 07dec31..c4013a0 100644
@@ -192,21 +192,49 @@ lower_task_payload_to_shared(nir_builder *b,
 }
 
 static void
+copy_shared_to_payload(nir_builder *b,
+                       unsigned num_components,
+                       nir_ssa_def *addr,
+                       unsigned shared_base,
+                       unsigned off)
+{
+   /* Read from shared memory. */
+   nir_ssa_def *copy = nir_load_shared(b, num_components, 32, addr,
+                                       .align_mul = 16,
+                                       .base = shared_base + off);
+
+   /* Write to task payload memory. */
+   nir_store_task_payload(b, copy, addr, .base = off);
+}
+
+static void
 emit_shared_to_payload_copy(nir_builder *b,
                             uint32_t payload_addr,
                             uint32_t payload_size,
                             lower_task_state *s)
 {
+   /* Copy from shared memory to the task payload using as much parallelism
+    * as possible. This is achieved by splitting the work into at most 3 phases:
+    * 1) copy the maximum number of vec4s using all invocations in the workgroup
+    * 2) copy the maximum number of vec4s using some invocations
+    * 3) copy the remaining dwords (< 4) using only the first invocation
+    */
    const unsigned invocations = b->shader->info.workgroup_size[0] *
-                          b->shader->info.workgroup_size[1] *
-                          b->shader->info.workgroup_size[2];
-   const unsigned bytes_per_copy = 16;
-   const unsigned copies_needed = DIV_ROUND_UP(payload_size, bytes_per_copy);
-   const unsigned copies_per_invocation = DIV_ROUND_UP(copies_needed, invocations);
+                                b->shader->info.workgroup_size[1] *
+                                b->shader->info.workgroup_size[2];
+   const unsigned vec4size = 16;
+   const unsigned whole_wg_vec4_copies = payload_size / vec4size;
+   const unsigned vec4_copies_per_invocation = whole_wg_vec4_copies / invocations;
+   const unsigned remaining_vec4_copies = whole_wg_vec4_copies % invocations;
+   const unsigned remaining_dwords =
+         DIV_ROUND_UP(payload_size
+                       - vec4size * vec4_copies_per_invocation * invocations
+                       - vec4size * remaining_vec4_copies,
+                      4);
    const unsigned base_shared_addr = s->payload_shared_addr + payload_addr;
 
    nir_ssa_def *invocation_index = nir_load_local_invocation_index(b);
-   nir_ssa_def *addr = nir_imul_imm(b, invocation_index, bytes_per_copy);
+   nir_ssa_def *addr = nir_imul_imm(b, invocation_index, vec4size);
 
    /* Wait for all previous shared stores to finish.
     * This is necessary because we placed the payload in shared memory.
@@ -216,22 +244,50 @@ emit_shared_to_payload_copy(nir_builder *b,
                          .memory_semantics = NIR_MEMORY_ACQ_REL,
                          .memory_modes = nir_var_mem_shared);
 
-   for (unsigned i = 0; i < copies_per_invocation; ++i) {
-      /* Payload_size is a size of user-accessible payload, but on some
-       * hardware (e.g. Intel) payload has a private header, which we have
-       * to offset (payload_offset_in_bytes).
-       */
-      unsigned const_off =
-            bytes_per_copy * invocations * i + s->payload_offset_in_bytes;
+   /* Payload_size is the size of the user-accessible payload, but on some
+    * hardware (e.g. Intel) the payload has a private header, which we have
+    * to offset (payload_offset_in_bytes).
+    */
+   unsigned off = s->payload_offset_in_bytes;
 
-      /* Read from shared memory. */
-      nir_ssa_def *copy =
-         nir_load_shared(b, 4, 32, addr, .align_mul = 16,
-                         .base = base_shared_addr + const_off);
+   /* Technically, dword alignment is not necessary for the correctness
+    * of the code below, but even if the backend implements unaligned
+    * loads/stores, they will very likely be slower.
+    */
+   assert(off % 4 == 0);
 
-      /* Write to task payload memory. */
-      nir_store_task_payload(b, copy, addr, .base = const_off);
+   /* Copy full vec4s using all invocations in workgroup. */
+   for (unsigned i = 0; i < vec4_copies_per_invocation; ++i) {
+      copy_shared_to_payload(b, vec4size / 4, addr, base_shared_addr, off);
+      off += vec4size * invocations;
    }
+
+   /* Copy full vec4s using only the invocations needed to not overflow. */
+   if (remaining_vec4_copies > 0) {
+      assert(remaining_vec4_copies < invocations);
+
+      nir_ssa_def *cmp = nir_ilt(b, invocation_index, nir_imm_int(b, remaining_vec4_copies));
+      nir_if *if_stmt = nir_push_if(b, cmp);
+      {
+         copy_shared_to_payload(b, vec4size / 4, addr, base_shared_addr, off);
+      }
+      nir_pop_if(b, if_stmt);
+      off += vec4size * remaining_vec4_copies;
+   }
+
+   /* Copy the last few dwords that do not form a full vec4. */
+   if (remaining_dwords > 0) {
+      assert(remaining_dwords < 4);
+      nir_ssa_def *cmp = nir_ieq(b, invocation_index, nir_imm_int(b, 0));
+      nir_if *if_stmt = nir_push_if(b, cmp);
+      {
+         copy_shared_to_payload(b, remaining_dwords, addr, base_shared_addr, off);
+      }
+      nir_pop_if(b, if_stmt);
+      off += remaining_dwords * 4;
+   }
+
+   assert(s->payload_offset_in_bytes + ALIGN(payload_size, 4) == off);
 }
 
 static bool
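
For illustration only, not part of the committed change: continuing the same
hypothetical 36-byte payload / 2-invocation workgroup, a sketch of how the new
three-phase split copies exactly the payload size. The variable names follow
the diff above; the standalone harness and the concrete numbers are assumptions.

#include <assert.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
   const unsigned payload_size = 36;  /* hypothetical, in bytes (4-aligned) */
   const unsigned invocations = 2;    /* hypothetical workgroup size */
   const unsigned vec4size = 16;

   /* Phase 1: full vec4s copied by every invocation in the workgroup. */
   unsigned whole_wg_vec4_copies = payload_size / vec4size;                  /* 2 */
   unsigned vec4_copies_per_invocation = whole_wg_vec4_copies / invocations; /* 1 */

   /* Phase 2: full vec4s copied by only the first few invocations. */
   unsigned remaining_vec4_copies = whole_wg_vec4_copies % invocations;      /* 0 */

   /* Phase 3: leftover dwords copied by the first invocation alone. */
   unsigned remaining_dwords =
      DIV_ROUND_UP(payload_size
                   - vec4size * vec4_copies_per_invocation * invocations
                   - vec4size * remaining_vec4_copies, 4);                   /* 1 */

   unsigned new_bytes_written =
      vec4size * (vec4_copies_per_invocation * invocations + remaining_vec4_copies)
      + 4 * remaining_dwords;

   /* Exactly the (4-aligned) payload size, matching the assert at the end
    * of emit_shared_to_payload_copy() in the diff.
    */
   assert(new_bytes_written == payload_size);
   return 0;
}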