radeonsi: merge 2 conditional blocks with same condition into 1 in culling code

author Marek Olšák <marek.olsak@amd.com>

Tue, 11 May 2021 15:47:10 +0000 (11:47 -0400)

committer Marge Bot <eric+marge@anholt.net>

Mon, 21 Jun 2021 19:03:29 +0000 (19:03 +0000)
author Marek Olšák <marek.olsak@amd.com>
Tue, 11 May 2021 15:47:10 +0000 (11:47 -0400)
committer Marge Bot <eric+marge@anholt.net>
Mon, 21 Jun 2021 19:03:29 +0000 (19:03 +0000)
diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c

index 02372d8..8f08f27 100644 (file)
--- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
+++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
@@ -1168,41 +1168,18 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi, unsigned max_out
     if (gfx10_ngg_export_prim_early(shader))
        gfx10_ngg_build_export_prim(ctx, NULL, LLVMBuildLoad(builder, new_vgpr0, ""));
  
-   /* Set the new ES input VGPRs. */
-   LLVMValueRef es_data[4];
-
-   for (unsigned i = 0; i < 4; i++)
-      es_data[i] = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
-
-   ac_build_ifcc(&ctx->ac, LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, tid, new_num_es_threads, ""),
-                 16012);
-   {
-      LLVMValueRef tmp;
-
-      for (unsigned i = 0; i < 2; i++) {
-         tmp = LLVMBuildLoad(
-            builder,
-            ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_vertex_id + i, 0)),
-            "");
-         LLVMBuildStore(builder, tmp, es_data[i]);
-      }
-
-      if (ctx->stage == MESA_SHADER_TESS_EVAL) {
-         tmp = LLVMBuildLoad(builder,
-                             si_build_gep_i8(ctx, es_vtxptr, lds_byte2_tes_rel_patch_id), "");
-         tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, "");
-         LLVMBuildStore(builder, tmp, es_data[2]);
-
-         if (uses_tes_prim_id) {
-            tmp = LLVMBuildLoad(builder,
-                                ac_build_gep0(&ctx->ac, es_vtxptr,
-                                              LLVMConstInt(ctx->ac.i32, lds_tes_patch_id, 0)),
-                                "");
-            LLVMBuildStore(builder, tmp, es_data[3]);
-         }
+   /* Prepare LDS addresses of the new ES input VGPRs. */
+   LLVMValueRef input_vgpr_addresses[4] = {
+      ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_vertex_id, 0)),
+      ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_instance_id, 0)),
+   };
+   if (ctx->stage == MESA_SHADER_TESS_EVAL) {
+      input_vgpr_addresses[2] = si_build_gep_i8(ctx, es_vtxptr, lds_byte2_tes_rel_patch_id);
+      if (uses_tes_prim_id) {
+         input_vgpr_addresses[3] = ac_build_gep0(&ctx->ac, es_vtxptr,
+                                                 LLVMConstInt(ctx->ac.i32, lds_tes_patch_id, 0));
        }
     }
-   ac_build_endif(&ctx->ac, 16012);
  
     /* Return values for the main function. */
     LLVMValueRef ret = ctx->return_value;
@@ -1256,13 +1233,16 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi, unsigned max_out
     ret = si_insert_input_ret_float(ctx, ret, ctx->args.gs_invocation_id, vgpr++);
     vgpr++; /* gs_vtx45_offset */
  
+   /* Set the input VGPRs to the corresponding LDS addresses where the VGPR values are
+    * stored. The VS prolog will load them.
+    */
     if (ctx->stage == MESA_SHADER_VERTEX) {
-      val = LLVMBuildLoad(builder, es_data[0], "");
+      val = LLVMBuildPtrToInt(builder, input_vgpr_addresses[0], ctx->ac.i32, "");
        ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++,
                                   ""); /* VGPR5 - VertexID */
        vgpr += 2;
        if (uses_instance_id) {
-         val = LLVMBuildLoad(builder, es_data[1], "");
+         val = LLVMBuildPtrToInt(builder, input_vgpr_addresses[1], ctx->ac.i32, "");
           ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++,
                                      ""); /* VGPR8 - InstanceID */
        } else {
@@ -1272,7 +1252,7 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi, unsigned max_out
        assert(ctx->stage == MESA_SHADER_TESS_EVAL);
        unsigned num_vgprs = uses_tes_prim_id ? 4 : 3;
        for (unsigned i = 0; i < num_vgprs; i++) {
-         val = LLVMBuildLoad(builder, es_data[i], "");
+         val = LLVMBuildPtrToInt(builder, input_vgpr_addresses[i], ctx->ac.i32, "");
           ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, "");
        }
        if (num_vgprs == 3)
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c

index 94dd486..ab50dbd 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1295,6 +1295,9 @@ bool si_vs_needs_prolog(const struct si_shader_selector *sel,
      * VS prolog. */
     return sel->vs_needs_prolog || prolog_key->ls_vgpr_fix ||
            prolog_key->unpack_instance_id_from_vertex_id ||
+          /* The 2nd VS prolog loads input VGPRs from LDS. */
+          (key->opt.ngg_culling && !ngg_cull_shader) ||
+          /* The 1st VS prolog generates input VGPRs for fast launch. */
            (ngg_cull_shader && key->opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL);
  }
  
@@ -1329,6 +1332,8 @@ void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_
           !!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP);
        key->vs_prolog.gs_fast_launch_index_size_packed =
           SI_GET_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(shader_out->key.opt.ngg_culling);
+   } else if (shader_out->key.opt.ngg_culling) {
+      key->vs_prolog.load_vgprs_after_culling = 1;
     }
  
     if (shader_out->selector->info.stage == MESA_SHADER_TESS_CTRL) {
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h

index 44562fc..222a46c 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -582,6 +582,7 @@ union si_shader_part_key {
        unsigned gs_fast_launch_tri_list : 1;  /* for NGG culling */
        unsigned gs_fast_launch_tri_strip : 1; /* for NGG culling */
        unsigned gs_fast_launch_index_size_packed : 2;
+      unsigned load_vgprs_after_culling : 1;
        /* Prologs for monolithic shaders shouldn't set EXEC. */
        unsigned is_monolithic : 1;
     } vs_prolog;
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c

index 2babb7e..574d8b3 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_shader_llvm.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c
@@ -1090,7 +1090,7 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
     if (shader->is_monolithic && ctx.stage == MESA_SHADER_VERTEX) {
        LLVMValueRef parts[4];
        unsigned num_parts = 0;
-      bool has_prolog = false;
+      bool first_is_prolog = false;
        LLVMValueRef main_fn = ctx.main_fn;
  
        if (ngg_cull_main_fn) {
@@ -1101,7 +1101,7 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
              prolog_key.vs_prolog.is_monolithic = true;
              si_llvm_build_vs_prolog(&ctx, &prolog_key);
              parts[num_parts++] = ctx.main_fn;
-            has_prolog = true;
+            first_is_prolog = true;
           }
           parts[num_parts++] = ngg_cull_main_fn;
        }
@@ -1113,21 +1113,34 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
           prolog_key.vs_prolog.is_monolithic = true;
           si_llvm_build_vs_prolog(&ctx, &prolog_key);
           parts[num_parts++] = ctx.main_fn;
-         has_prolog = true;
+         if (num_parts == 1)
+            first_is_prolog = true;
        }
        parts[num_parts++] = main_fn;
  
-      si_build_wrapper_function(&ctx, parts, num_parts, has_prolog ? 1 : 0, 0, false);
+      si_build_wrapper_function(&ctx, parts, num_parts, first_is_prolog ? 1 : 0, 0, false);
  
        if (ctx.shader->key.opt.vs_as_prim_discard_cs)
           si_build_prim_discard_compute_shader(&ctx);
     } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_TESS_EVAL && ngg_cull_main_fn) {
-      LLVMValueRef parts[2];
+      LLVMValueRef parts[3], prolog, main_fn = ctx.main_fn;
+
+      /* We reuse the VS prolog code for TES just to load the input VGPRs from LDS. */
+      union si_shader_part_key prolog_key;
+      memset(&prolog_key, 0, sizeof(prolog_key));
+      prolog_key.vs_prolog.num_input_sgprs = shader->info.num_input_sgprs;
+      prolog_key.vs_prolog.num_merged_next_stage_vgprs = 5;
+      prolog_key.vs_prolog.as_ngg = 1;
+      prolog_key.vs_prolog.load_vgprs_after_culling = 1;
+      prolog_key.vs_prolog.is_monolithic = true;
+      si_llvm_build_vs_prolog(&ctx, &prolog_key);
+      prolog = ctx.main_fn;
  
        parts[0] = ngg_cull_main_fn;
-      parts[1] = ctx.main_fn;
+      parts[1] = prolog;
+      parts[2] = main_fn;
  
-      si_build_wrapper_function(&ctx, parts, 2, 0, 0, false);
+      si_build_wrapper_function(&ctx, parts, 3, 0, 0, false);
     } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_TESS_CTRL) {
        if (sscreen->info.chip_class >= GFX9) {
           struct si_shader_selector *ls = shader->key.part.tcs.ls;
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c

index 89adcfa..f49e0de 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c
@@ -887,6 +887,19 @@ void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part
        }
     }
  
+   /* The culling code stored the LDS addresses of the VGPRs into those VGPRs. Load them. */
+   if (key->vs_prolog.load_vgprs_after_culling) {
+      for (i = 5; i <= 8; i++) {
+         bool is_tes_rel_patch_id = i == 7;
+         input_vgprs[i] = LLVMBuildIntToPtr(ctx->ac.builder, input_vgprs[i],
+                                            LLVMPointerType(is_tes_rel_patch_id ? ctx->ac.i8 : ctx->ac.i32,
+                                                            AC_ADDR_SPACE_LDS), "");
+         input_vgprs[i] = LLVMBuildLoad(ctx->ac.builder, input_vgprs[i], "");
+         if (is_tes_rel_patch_id)
+            input_vgprs[i] = LLVMBuildZExt(ctx->ac.builder, input_vgprs[i], ctx->ac.i32, "");
+      }
+   }
+
     if (key->vs_prolog.gs_fast_launch_tri_list || key->vs_prolog.gs_fast_launch_tri_strip) {
        LLVMValueRef wave_id, thread_id_in_tg;
author	Marek Olšák <marek.olsak@amd.com>
	Tue, 11 May 2021 15:47:10 +0000 (11:47 -0400)
committer	Marge Bot <eric+marge@anholt.net>
	Mon, 21 Jun 2021 19:03:29 +0000 (19:03 +0000)
src/gallium/drivers/radeonsi/gfx10_shader_ngg.c		patch \| blob \| history
src/gallium/drivers/radeonsi/si_shader.c		patch \| blob \| history
src/gallium/drivers/radeonsi/si_shader.h		patch \| blob \| history
src/gallium/drivers/radeonsi/si_shader_llvm.c		patch \| blob \| history
src/gallium/drivers/radeonsi/si_shader_llvm_vs.c		patch \| blob \| history