radeonsi: try to keep all VS input loads together for better perf

author Marek Olšák <marek.olsak@amd.com>

Mon, 24 May 2021 23:42:17 +0000 (19:42 -0400)

committer Marge Bot <eric+marge@anholt.net>

Mon, 21 Jun 2021 19:03:29 +0000 (19:03 +0000)
author Marek Olšák <marek.olsak@amd.com>
Mon, 24 May 2021 23:42:17 +0000 (19:42 -0400)
committer Marge Bot <eric+marge@anholt.net>
Mon, 21 Jun 2021 19:03:29 +0000 (19:03 +0000)
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c

index f49e0de..0d07aeb 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c
@@ -95,6 +95,18 @@ static void load_input_vs(struct si_shader_context *ctx, unsigned input_index, L
        return;
     }
  
+   /* Set can_speculate=false to help keep all loads grouped together
+    * for better latency hiding. If it were true, LLVM could move the loads forward
+    * and accidentally double memory latency by doing:
+    *
+    *    buffer_load_dword_xyzw
+    *    s_waitcnt vmcnt(0)
+    *    buffer_load_dword_xyzw
+    *    s_waitcnt vmcnt(0)
+    *
+    * ... which is what we must prevent at all costs.
+    */
+   const bool can_speculate = false;
     unsigned bit_size = info->input_fp16_lo_hi_valid[input_index] & 0x1 ? 16 : 32;
     LLVMTypeRef int_type = bit_size == 16 ? ctx->ac.i16 : ctx->ac.i32;
     LLVMTypeRef float_type = bit_size == 16 ? ctx->ac.f16 : ctx->ac.f32;
@@ -125,7 +137,7 @@ static void load_input_vs(struct si_shader_context *ctx, unsigned input_index, L
        tmp = ac_build_opencoded_load_format(&ctx->ac, fix_fetch.u.log_size,
                                             fix_fetch.u.num_channels_m1 + 1, fix_fetch.u.format,
                                             fix_fetch.u.reverse, !opencode, vb_desc, vertex_index,
-                                           ctx->ac.i32_0, ctx->ac.i32_0, 0, true);
+                                           ctx->ac.i32_0, ctx->ac.i32_0, 0, can_speculate);
        for (unsigned i = 0; i < 4; ++i)
           out[i] =
              LLVMBuildExtractElement(ctx->ac.builder, tmp, LLVMConstInt(ctx->ac.i32, i, false), "");
@@ -171,7 +183,7 @@ static void load_input_vs(struct si_shader_context *ctx, unsigned input_index, L
     for (unsigned i = 0; i < num_fetches; ++i) {
        LLVMValueRef voffset = LLVMConstInt(ctx->ac.i32, fetch_stride * i, 0);
        fetches[i] = ac_build_buffer_load_format(&ctx->ac, vb_desc, vertex_index, voffset,
-                                               channels_per_fetch, 0, true,
+                                               channels_per_fetch, 0, can_speculate,
                                                 bit_size == 16, false);
     }
author	Marek Olšák <marek.olsak@amd.com>
	Mon, 24 May 2021 23:42:17 +0000 (19:42 -0400)
committer	Marge Bot <eric+marge@anholt.net>
	Mon, 21 Jun 2021 19:03:29 +0000 (19:03 +0000)