radeonsi: support upload multi part shader binary
author Qiang Yu <yuq825@gmail.com>
Thu, 20 Jul 2023 08:53:55 +0000 (16:53 +0800)
committer Qiang Yu <yuq825@gmail.com>
Wed, 16 Aug 2023 03:25:28 +0000 (11:25 +0800)
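Each shader part's binary needs to be split into an exec part and a data
part, so that the exec parts of all shader parts can be uploaded
back-to-back, followed by all of the data parts. Since this moves each
part's const data relative to its code, the const data symbols in the
code need to be relocated.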

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Signed-off-by: Qiang Yu <yuq825@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24443>

src/gallium/drivers/radeonsi/si_shader.c
src/gallium/drivers/radeonsi/si_shader_aco.c
src/gallium/drivers/radeonsi/si_shader_internal.h

index e624f596173d416baa3f686dc46a785abae37d08..f93dad3acc1dc97c6ac1231948a63d459b3094a9 100644 (file)
@@ -830,6 +830,24 @@ bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader,
    return ok;
 }
 
+static unsigned get_shader_binaries(struct si_shader *shader, struct si_shader_binary *bin[4])
+{
+   unsigned num_bin = 0; /* entries stored in bin[] so far; also the return value */
+   /* Store the binary of each part present, in order: prolog, previous stage, main, epilog. */
+   if (shader->prolog)
+      bin[num_bin++] = &shader->prolog->binary;
+
+   if (shader->previous_stage)
+      bin[num_bin++] = &shader->previous_stage->binary;
+
+   bin[num_bin++] = &shader->binary; /* the shader's own binary is stored unconditionally */
+
+   if (shader->epilog)
+      bin[num_bin++] = &shader->epilog->binary;
+
+   return num_bin; /* between 1 and 4, so it never exceeds the bin[4] capacity */
+}
+
 static unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_shader *shader)
 {
    if (shader->binary.type == SI_SHADER_BINARY_ELF) {
@@ -839,8 +857,15 @@ static unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_sh
       ac_rtld_close(&rtld);
       return size;
    } else {
-      assert(shader->binary.type == SI_SHADER_BINARY_RAW);
-      return shader->binary.code_size;
+      struct si_shader_binary *bin[4];
+      unsigned num_bin = get_shader_binaries(shader, bin);
+
+      unsigned size = 0;
+      for (unsigned i = 0; i < num_bin; i++) {
+         assert(bin[i]->type == SI_SHADER_BINARY_RAW);
+         size += bin[i]->exec_size;
+      }
+      return size;
    }
 }
 
@@ -942,8 +967,17 @@ static void calculate_needed_lds_size(struct si_screen *sscreen, struct si_shade
 static bool upload_binary_raw(struct si_screen *sscreen, struct si_shader *shader,
                               uint64_t scratch_va)
 {
-   unsigned rx_size =
-      ac_align_shader_binary_for_prefetch(&sscreen->info, shader->binary.code_size);
+   struct si_shader_binary *bin[4];
+   unsigned num_bin = get_shader_binaries(shader, bin);
+
+   unsigned code_size = 0, exec_size = 0;
+   for (unsigned i = 0; i < num_bin; i++) {
+      assert(bin[i]->type == SI_SHADER_BINARY_RAW);
+      code_size += bin[i]->code_size;
+      exec_size += bin[i]->exec_size;
+   }
+
+   unsigned rx_size = ac_align_shader_binary_for_prefetch(&sscreen->info, code_size);
 
    si_resource_reference(&shader->bo, NULL);
    shader->bo =
@@ -965,9 +999,32 @@ static bool upload_binary_raw(struct si_screen *sscreen, struct si_shader *shade
    if (!rx_ptr)
       return false;
 
-   memcpy(rx_ptr, shader->binary.code_buffer, shader->binary.code_size);
+   unsigned exec_offset = 0, data_offset = exec_size;
+   for (unsigned i = 0; i < num_bin; i++) {
+      memcpy(rx_ptr + exec_offset, bin[i]->code_buffer, bin[i]->exec_size);
+
+      if (bin[i]->num_symbols) {
+         /* Offset that must be added to this part's const data symbols, because
+          * other shader parts are inserted between its exec code and its const data.
+          */
+         unsigned const_offset = data_offset - exec_offset - bin[i]->exec_size;
+
+         /* Prolog and epilog have no symbols. */
+         struct si_shader *sh = bin[i] == &shader->binary ? shader : shader->previous_stage;
+         assert(sh && bin[i] == &sh->binary);
 
-   si_aco_resolve_symbols(shader, rx_ptr, scratch_va);
+         si_aco_resolve_symbols(sh, rx_ptr + exec_offset, (const uint32_t *)bin[i]->code_buffer,
+                                scratch_va, const_offset);
+      }
+
+      exec_offset += bin[i]->exec_size;
+
+      unsigned data_size = bin[i]->code_size - bin[i]->exec_size;
+      if (data_size) {
+         memcpy(rx_ptr + data_offset, bin[i]->code_buffer + bin[i]->exec_size, data_size);
+         data_offset += data_size;
+      }
+   }
 
    sscreen->ws->buffer_unmap(sscreen->ws, shader->bo->buf);
    shader->gpu_address = shader->bo->gpu_address;
index 4e1f371e7fa0945352d665f2b269a20174467ce9..62b300968f064ba1a98075e59857c6f009812f01 100644 (file)
@@ -177,7 +177,8 @@ si_aco_compile_shader(struct si_shader *shader,
 }
 
 void
-si_aco_resolve_symbols(struct si_shader *shader, uint32_t *code, uint64_t scratch_va)
+si_aco_resolve_symbols(struct si_shader *shader, uint32_t *code_for_write,
+                       const uint32_t *code_for_read, uint64_t scratch_va, uint32_t const_offset)
 {
    const struct aco_symbol *symbols = (struct aco_symbol *)shader->binary.symbols;
    const struct si_shader_selector *sel = shader->selector;
@@ -210,12 +211,15 @@ si_aco_resolve_symbols(struct si_shader *shader, uint32_t *code, uint64_t scratc
          value = shader->gs_info.esgs_ring_size * 4;
          break;
       case aco_symbol_const_data_addr:
-         continue;
+         if (!const_offset)
+            continue;
+         value = code_for_read[symbols[i].offset] + const_offset;
+         break;
       default:
          unreachable("invalid aco symbol");
          break;
       }
 
-      code[symbols[i].offset] = value;
+      code_for_write[symbols[i].offset] = value;
    }
 }
index ebf3b5c14e0874a8ca8dfa9454bf8b34ab438ff6..d5e00987a8fd3914c67789005e1bce934c6e3e43 100644 (file)
@@ -215,6 +215,8 @@ bool si_aco_compile_shader(struct si_shader *shader,
                            struct si_shader_args *args,
                            struct nir_shader *nir,
                            struct util_debug_callback *debug);
-void si_aco_resolve_symbols(struct si_shader *shader, uint32_t *code, uint64_t scratch_va);
+void si_aco_resolve_symbols(struct si_shader *shader, uint32_t *code_for_write,
+                            const uint32_t *code_for_read, uint64_t scratch_va,
+                            uint32_t const_offset);
 
 #endif