From b35b5926c9b077087bcfc614ecd4203b710a57aa Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Mon, 24 May 2021 19:42:17 -0400 Subject: [PATCH] radeonsi: try to keep all VS input loads together for better perf Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_shader_llvm_vs.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c index f49e0de..0d07aeb 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c @@ -95,6 +95,18 @@ static void load_input_vs(struct si_shader_context *ctx, unsigned input_index, L return; } + /* Set can_speculate=false to help keep all loads grouped together + * for better latency hiding. If it were true, LLVM could move the loads forward + * and accidentally double memory latency by doing: + * + * buffer_load_dword_xyzw + * s_waitcnt vmcnt(0) + * buffer_load_dword_xyzw + * s_waitcnt vmcnt(0) + * + * ... which is what we must prevent at all costs. + */ + const bool can_speculate = false; unsigned bit_size = info->input_fp16_lo_hi_valid[input_index] & 0x1 ? 16 : 32; LLVMTypeRef int_type = bit_size == 16 ? ctx->ac.i16 : ctx->ac.i32; LLVMTypeRef float_type = bit_size == 16 ? 
ctx->ac.f16 : ctx->ac.f32; @@ -125,7 +137,7 @@ static void load_input_vs(struct si_shader_context *ctx, unsigned input_index, L tmp = ac_build_opencoded_load_format(&ctx->ac, fix_fetch.u.log_size, fix_fetch.u.num_channels_m1 + 1, fix_fetch.u.format, fix_fetch.u.reverse, !opencode, vb_desc, vertex_index, - ctx->ac.i32_0, ctx->ac.i32_0, 0, true); + ctx->ac.i32_0, ctx->ac.i32_0, 0, can_speculate); for (unsigned i = 0; i < 4; ++i) out[i] = LLVMBuildExtractElement(ctx->ac.builder, tmp, LLVMConstInt(ctx->ac.i32, i, false), ""); @@ -171,7 +183,7 @@ static void load_input_vs(struct si_shader_context *ctx, unsigned input_index, L for (unsigned i = 0; i < num_fetches; ++i) { LLVMValueRef voffset = LLVMConstInt(ctx->ac.i32, fetch_stride * i, 0); fetches[i] = ac_build_buffer_load_format(&ctx->ac, vb_desc, vertex_index, voffset, - channels_per_fetch, 0, true, + channels_per_fetch, 0, can_speculate, bit_size == 16, false); } -- 2.7.4