From 86832964cf26228427bbde3f1c60a5df040e312a Mon Sep 17 00:00:00 2001
From: Qiang Yu <yuq825@gmail.com>
Date: Tue, 5 Jul 2022 15:15:16 +0800
Subject: [PATCH] radeonsi: vs load input re-calculate vertex index after
 culling
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

After culling, vertices are repacked, we overwrite the original
vertex_id/instance_id, so vertex index for each thread need to
be re-calculated.

Reviewed-by: Marek OlÅ¡Ã¡k <marek.olsak@amd.com>
Signed-off-by: Qiang Yu <yuq825@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17455>
---
 src/gallium/drivers/radeonsi/si_shader_internal.h |   1 +
 src/gallium/drivers/radeonsi/si_shader_llvm.c     |   9 ++
 src/gallium/drivers/radeonsi/si_shader_llvm_vs.c  | 100 +++++++++++++++-------
 3 files changed, 77 insertions(+), 33 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h
index 7bfc8c8..e75de69 100644
--- a/src/gallium/drivers/radeonsi/si_shader_internal.h
+++ b/src/gallium/drivers/radeonsi/si_shader_internal.h
@@ -139,6 +139,7 @@ struct si_shader_context {
    LLVMValueRef esgs_ring;
    LLVMValueRef gsvs_ring[4];
    LLVMValueRef tess_offchip_ring;
+   LLVMValueRef instance_divisor_constbuf;
 
    LLVMValueRef gs_next_vertex[4];
    LLVMValueRef gs_curprim_verts[4];
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c
index e78f83c..21ff551 100644
--- a/src/gallium/drivers/radeonsi/si_shader_llvm.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c
@@ -884,6 +884,15 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad
    switch (ctx->stage) {
    case MESA_SHADER_VERTEX:
       si_llvm_init_vs_callbacks(ctx, ngg_cull_shader);
+
+      /* preload instance_divisor_constbuf to be used for input load after culling */
+      if (ctx->shader->key.ge.opt.ngg_culling &&
+          ctx->shader->key.ge.part.vs.prolog.instance_divisor_is_fetched) {
+         LLVMValueRef buf = ac_get_arg(&ctx->ac, ctx->internal_bindings);
+         ctx->instance_divisor_constbuf =
+            ac_build_load_to_sgpr(
+               &ctx->ac, buf, LLVMConstInt(ctx->ac.i32, SI_VS_CONST_INSTANCE_DIVISORS, 0));
+      }
       break;
 
    case MESA_SHADER_TESS_CTRL:
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c
index ac45d41..34140fa 100644
--- a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c
@@ -39,6 +39,53 @@ static LLVMValueRef unpack_sint16(struct si_shader_context *ctx, LLVMValueRef i3
                         ctx->ac.i32, "");
 }
 
+static LLVMValueRef get_vertex_index(struct si_shader_context *ctx,
+                                     struct si_vs_prolog_bits *key, unsigned input_index,
+                                     LLVMValueRef instance_divisor_constbuf,
+                                     unsigned start_instance, unsigned base_vertex)
+{
+   LLVMValueRef instance_id = ctx->abi.instance_id_replaced ?
+      ctx->abi.instance_id_replaced : ctx->abi.instance_id;
+   LLVMValueRef vertex_id = ctx->abi.vertex_id_replaced ?
+      ctx->abi.vertex_id_replaced : ctx->abi.vertex_id;
+
+   bool divisor_is_one = key->instance_divisor_is_one & (1u << input_index);
+   bool divisor_is_fetched =key->instance_divisor_is_fetched & (1u << input_index);
+
+   LLVMValueRef index = NULL;
+   if (divisor_is_one)
+      index = instance_id;
+   else if (divisor_is_fetched) {
+      LLVMValueRef udiv_factors[4];
+
+      for (unsigned j = 0; j < 4; j++) {
+         udiv_factors[j] = si_buffer_load_const(
+            ctx, instance_divisor_constbuf,
+            LLVMConstInt(ctx->ac.i32, input_index * 16 + j * 4, 0));
+         udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]);
+      }
+
+      /* The faster NUW version doesn't work when InstanceID == UINT_MAX.
+       * Such InstanceID might not be achievable in a reasonable time though.
+       */
+      index = ac_build_fast_udiv_nuw(
+         &ctx->ac, instance_id, udiv_factors[0],
+         udiv_factors[1], udiv_factors[2], udiv_factors[3]);
+   }
+
+   if (divisor_is_one || divisor_is_fetched) {
+      /* Add StartInstance. */
+      index = LLVMBuildAdd(ctx->ac.builder, index,
+                           LLVMGetParam(ctx->main_fn, start_instance), "");
+   } else {
+      /* VertexID + BaseVertex */
+      index = LLVMBuildAdd(ctx->ac.builder, vertex_id,
+                           LLVMGetParam(ctx->main_fn, base_vertex), "");
+   }
+
+   return index;
+}
+
 static void load_input_vs(struct si_shader_context *ctx, unsigned input_index, LLVMValueRef out[4])
 {
    const struct si_shader_info *info = &ctx->shader->selector->info;
@@ -114,7 +161,7 @@ static void load_input_vs(struct si_shader_context *ctx, unsigned input_index, L
    unsigned num_vbos_in_user_sgprs = ctx->shader->selector->info.num_vbos_in_user_sgprs;
    union si_vs_fix_fetch fix_fetch;
    LLVMValueRef vb_desc;
-   LLVMValueRef vertex_index;
+   LLVMValueRef vertex_index = NULL;
    LLVMValueRef tmp;
 
    if (input_index < num_vbos_in_user_sgprs) {
@@ -125,7 +172,19 @@ static void load_input_vs(struct si_shader_context *ctx, unsigned input_index, L
                                       LLVMConstInt(ctx->ac.i32, index, 0));
    }
 
-   vertex_index = LLVMGetParam(ctx->main_fn, ctx->vertex_index0.arg_index + input_index);
+   if (ctx->abi.vertex_id_replaced) {
+      /* Only ngg culling will replace vertex_id, and ngg culling is an optimization key
+       * field, so the shader must be monolithic.
+       */
+      assert(ctx->shader->is_monolithic);
+      assert(ctx->abi.instance_id_replaced);
+
+      vertex_index = get_vertex_index(ctx, &ctx->shader->key.ge.part.vs.prolog,
+                                      input_index, ctx->instance_divisor_constbuf,
+                                      ctx->args.start_instance.arg_index,
+                                      ctx->args.base_vertex.arg_index);
+   } else
+      vertex_index = LLVMGetParam(ctx->main_fn, ctx->vertex_index0.arg_index + input_index);
 
    /* Use the open-coded implementation for all loads of doubles and
     * of dword-sized data that needs fixups. We need to insert conversion
@@ -947,6 +1006,8 @@ void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part
 
    ctx->abi.vertex_id = input_vgprs[vertex_id_vgpr];
    ctx->abi.instance_id = input_vgprs[instance_id_vgpr];
+   ctx->abi.vertex_id_replaced = NULL;
+   ctx->abi.instance_id_replaced = NULL;
 
    /* Copy inputs to outputs. This should be no-op, as the registers match,
     * but it will prevent the compiler from overwriting them unintentionally.
@@ -978,37 +1039,10 @@ void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part
    }
 
    for (i = 0; i < key->vs_prolog.num_inputs; i++) {
-      bool divisor_is_one = key->vs_prolog.states.instance_divisor_is_one & (1u << i);
-      bool divisor_is_fetched = key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
-      LLVMValueRef index = NULL;
-
-      if (divisor_is_one) {
-         index = ctx->abi.instance_id;
-      } else if (divisor_is_fetched) {
-         LLVMValueRef udiv_factors[4];
-
-         for (unsigned j = 0; j < 4; j++) {
-            udiv_factors[j] = si_buffer_load_const(ctx, instance_divisor_constbuf,
-                                                   LLVMConstInt(ctx->ac.i32, i * 16 + j * 4, 0));
-            udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]);
-         }
-         /* The faster NUW version doesn't work when InstanceID == UINT_MAX.
-          * Such InstanceID might not be achievable in a reasonable time though.
-          */
-         index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id, udiv_factors[0],
-                                        udiv_factors[1], udiv_factors[2], udiv_factors[3]);
-      }
-
-      if (divisor_is_one || divisor_is_fetched) {
-         /* Add StartInstance. */
-         index =
-            LLVMBuildAdd(ctx->ac.builder, index,
-                         LLVMGetParam(ctx->main_fn, user_sgpr_base + SI_SGPR_START_INSTANCE), "");
-      } else {
-         /* VertexID + BaseVertex */
-         index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.vertex_id,
-                              LLVMGetParam(func, user_sgpr_base + SI_SGPR_BASE_VERTEX), "");
-      }
+      LLVMValueRef index = get_vertex_index(ctx, &key->vs_prolog.states, i,
+                                            instance_divisor_constbuf,
+                                            user_sgpr_base + SI_SGPR_START_INSTANCE,
+                                            user_sgpr_base + SI_SGPR_BASE_VERTEX);
 
       index = ac_to_float(&ctx->ac, index);
       ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index, ctx->args.arg_count + i, "");
-- 
2.7.4