radeonsi: if VS and TCS have the same number of threads, merge the conditonals

author Marek Olšák <marek.olsak@amd.com>

Sat, 14 Nov 2020 06:12:29 +0000 (01:12 -0500)

committer Marge Bot <eric+marge@anholt.net>

Mon, 23 Nov 2020 02:22:21 +0000 (02:22 +0000)
author Marek Olšák <marek.olsak@amd.com>
Sat, 14 Nov 2020 06:12:29 +0000 (01:12 -0500)
committer Marge Bot <eric+marge@anholt.net>
Mon, 23 Nov 2020 02:22:21 +0000 (02:22 +0000)
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h

index 8b7e8f2..e7d5753 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1111,6 +1111,7 @@ struct si_context {
     bool ls_vgpr_fix : 1;
     bool prim_discard_cs_instancing : 1;
     bool ngg : 1;
+   bool same_patch_vertices : 1;
     uint8_t ngg_culling;
     int last_index_size;
     int last_base_vertex;
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c

index fbeed39..ec748c9 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -464,7 +464,8 @@ void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader)
        ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
        ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
        ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
-      declare_vb_descriptor_input_sgprs(ctx);
+      if (ctx->stage == MESA_SHADER_VERTEX)
+         declare_vb_descriptor_input_sgprs(ctx);
  
        /* VGPRs (first TCS, then VS) */
        ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);
@@ -1212,6 +1213,8 @@ static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
        fprintf(f, "  part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
        fprintf(f, "  mono.u.ff_tcs_inputs_to_copy = 0x%" PRIx64 "\n",
                key->mono.u.ff_tcs_inputs_to_copy);
+      fprintf(f, "  opt.prefer_mono = %u\n", key->opt.prefer_mono);
+      fprintf(f, "  opt.same_patch_vertices = %u\n", key->opt.same_patch_vertices);
        break;
  
     case MESA_SHADER_TESS_EVAL:
@@ -1733,7 +1736,7 @@ static bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_com
        }
        parts[num_parts++] = main_fn;
  
-      si_build_wrapper_function(&ctx, parts, num_parts, has_prolog ? 1 : 0, 0);
+      si_build_wrapper_function(&ctx, parts, num_parts, has_prolog ? 1 : 0, 0, false);
  
        if (ctx.shader->key.opt.vs_as_prim_discard_cs)
           si_build_prim_discard_compute_shader(&ctx);
@@ -1743,7 +1746,7 @@ static bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_com
        parts[0] = ngg_cull_main_fn;
        parts[1] = ctx.main_fn;
  
-      si_build_wrapper_function(&ctx, parts, 2, 0, 0);
+      si_build_wrapper_function(&ctx, parts, 2, 0, 0, false);
     } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_TESS_CTRL) {
        if (sscreen->info.chip_class >= GFX9) {
           struct si_shader_selector *ls = shader->key.part.tcs.ls;
@@ -1792,7 +1795,8 @@ static bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_com
           ctx.stage = MESA_SHADER_TESS_CTRL;
  
           si_build_wrapper_function(&ctx, parts + !vs_needs_prolog, 4 - !vs_needs_prolog,
-                                   vs_needs_prolog, vs_needs_prolog ? 2 : 1);
+                                   vs_needs_prolog, vs_needs_prolog ? 2 : 1,
+                                   shader->key.opt.same_patch_vertices);
        } else {
           LLVMValueRef parts[2];
           union si_shader_part_key epilog_key;
@@ -1804,7 +1808,7 @@ static bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_com
           si_llvm_build_tcs_epilog(&ctx, &epilog_key);
           parts[1] = ctx.main_fn;
  
-         si_build_wrapper_function(&ctx, parts, 2, 0, 0);
+         si_build_wrapper_function(&ctx, parts, 2, 0, 0, false);
        }
     } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_GEOMETRY) {
        if (ctx.screen->info.chip_class >= GFX9) {
@@ -1866,7 +1870,7 @@ static bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_com
           parts[next_first_part = num_parts++] = gs_prolog;
           parts[num_parts++] = gs_main;
  
-         si_build_wrapper_function(&ctx, parts, num_parts, main_part, next_first_part);
+         si_build_wrapper_function(&ctx, parts, num_parts, main_part, next_first_part, false);
        } else {
           LLVMValueRef parts[2];
           union si_shader_part_key prolog_key;
@@ -1878,7 +1882,7 @@ static bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_com
           si_llvm_build_gs_prolog(&ctx, &prolog_key);
           parts[0] = ctx.main_fn;
  
-         si_build_wrapper_function(&ctx, parts, 2, 1, 0);
+         si_build_wrapper_function(&ctx, parts, 2, 1, 0, false);
        }
     } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_FRAGMENT) {
        si_llvm_build_monolithic_ps(&ctx, shader);
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h

index 1fe9263..c9aa439 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -673,6 +673,10 @@ struct si_shader_key {
        unsigned cs_cull_back : 1;
        unsigned cs_cull_z : 1;
        unsigned cs_halfz_clip_space : 1;
+
+      /* VS and TCS have the same number of patch vertices. */
+      unsigned same_patch_vertices:1;
+
        unsigned inline_uniforms:1;
  
        uint32_t inlined_uniform_values[MAX_INLINABLE_UNIFORMS];
diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h

index 6722e58..38c5ab9 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_shader_internal.h
+++ b/src/gallium/drivers/radeonsi/si_shader_internal.h
@@ -252,7 +252,7 @@ void si_llvm_declare_compute_memory(struct si_shader_context *ctx);
  bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir);
  void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *parts,
                                 unsigned num_parts, unsigned main_part,
-                               unsigned next_shader_first_part);
+                               unsigned next_shader_first_part, bool same_thread_count);
  
  /* si_shader_llvm_gs.c */
  LLVMValueRef si_is_es_thread(struct si_shader_context *ctx);
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c

index 3a69d1d..bb13d6d 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_shader_llvm.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c
@@ -456,7 +456,7 @@ bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir)
   */
  void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *parts,
                                 unsigned num_parts, unsigned main_part,
-                               unsigned next_shader_first_part)
+                               unsigned next_shader_first_part, bool same_thread_count)
  {
     LLVMBuilderRef builder = ctx->ac.builder;
     /* PS epilog has one arg per color component; gfx9 merged shader
@@ -559,7 +559,7 @@ void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *part
     si_llvm_create_func(ctx, "wrapper", returns, num_returns,
                         si_get_max_workgroup_size(ctx->shader));
  
-   if (si_is_merged_shader(ctx->shader))
+   if (si_is_merged_shader(ctx->shader) && !same_thread_count)
        ac_init_exec_full_mask(&ctx->ac);
  
     /* Record the arguments of the function as if they were an output of
@@ -618,11 +618,19 @@ void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *part
        /* Merged shaders are executed conditionally depending
         * on the number of enabled threads passed in the input SGPRs. */
        if (si_is_multi_part_shader(ctx->shader) && part == 0) {
-         LLVMValueRef ena, count = initial[3];
+         if (same_thread_count) {
+            struct ac_arg arg;
+            arg.arg_index = 3;
+            arg.used = true;
  
-         count = LLVMBuildAnd(builder, count, LLVMConstInt(ctx->ac.i32, 0x7f, 0), "");
-         ena = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), count, "");
-         ac_build_ifcc(&ctx->ac, ena, 6506);
+            si_init_exec_from_input(ctx, arg, 0);
+         } else {
+            LLVMValueRef ena, count = initial[3];
+
+            count = LLVMBuildAnd(builder, count, LLVMConstInt(ctx->ac.i32, 0x7f, 0), "");
+            ena = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), count, "");
+            ac_build_ifcc(&ctx->ac, ena, 6506);
+         }
        }
  
        /* Derive arguments for the next part from outputs of the
@@ -675,7 +683,8 @@ void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *part
  
        ret = ac_build_call(&ctx->ac, parts[part], in, num_params);
  
-      if (si_is_multi_part_shader(ctx->shader) && part + 1 == next_shader_first_part) {
+      if (!same_thread_count &&
+          si_is_multi_part_shader(ctx->shader) && part + 1 == next_shader_first_part) {
           ac_build_endif(&ctx->ac, 6506);
  
           /* The second half of the merged shader should use
@@ -729,7 +738,8 @@ void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *part
     }
  
     /* Close the conditional wrapping the second shader. */
-   if (ctx->stage == MESA_SHADER_TESS_CTRL && si_is_multi_part_shader(ctx->shader))
+   if (ctx->stage == MESA_SHADER_TESS_CTRL &&
+       !same_thread_count && si_is_multi_part_shader(ctx->shader))
        ac_build_endif(&ctx->ac, 6507);
  
     assert(LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind);
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c b/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c

index af662eb..dd5f647 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c
@@ -988,7 +988,7 @@ void si_llvm_build_monolithic_ps(struct si_shader_context *ctx, struct si_shader
     si_llvm_build_ps_epilog(ctx, &epilog_key);
     parts[num_parts++] = ctx->main_fn;
  
-   si_build_wrapper_function(ctx, parts, num_parts, main_index, 0);
+   si_build_wrapper_function(ctx, parts, num_parts, main_index, 0, false);
  }
  
  void si_llvm_init_ps_callbacks(struct si_shader_context *ctx)
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c

index beff65f..8a72921 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -1890,22 +1890,35 @@ static void si_draw_vbo(struct pipe_context *ctx,
        sctx->do_update_shaders = true;
     }
  
-   if (sctx->tes_shader.cso && sctx->screen->info.has_ls_vgpr_init_bug) {
-      /* Determine whether the LS VGPR fix should be applied.
-       *
-       * It is only required when num input CPs > num output CPs,
-       * which cannot happen with the fixed function TCS. We should
-       * also update this bit when switching from TCS to fixed
-       * function TCS.
-       */
+   if (sctx->tes_shader.cso) {
        struct si_shader_selector *tcs = sctx->tcs_shader.cso;
-      bool ls_vgpr_fix =
-         tcs && info->vertices_per_patch > tcs->info.base.tess.tcs_vertices_out;
  
-      if (ls_vgpr_fix != sctx->ls_vgpr_fix) {
-         sctx->ls_vgpr_fix = ls_vgpr_fix;
+      /* The rarely occuring tcs == NULL case is not optimized. */
+      bool same_patch_vertices =
+         sctx->chip_class >= GFX9 &&
+         tcs && info->vertices_per_patch == tcs->info.base.tess.tcs_vertices_out;
+
+      if (sctx->same_patch_vertices != same_patch_vertices) {
+         sctx->same_patch_vertices = same_patch_vertices;
           sctx->do_update_shaders = true;
        }
+
+      if (sctx->screen->info.has_ls_vgpr_init_bug) {
+         /* Determine whether the LS VGPR fix should be applied.
+          *
+          * It is only required when num input CPs > num output CPs,
+          * which cannot happen with the fixed function TCS. We should
+          * also update this bit when switching from TCS to fixed
+          * function TCS.
+          */
+         bool ls_vgpr_fix =
+            tcs && info->vertices_per_patch > tcs->info.base.tess.tcs_vertices_out;
+
+         if (ls_vgpr_fix != sctx->ls_vgpr_fix) {
+            sctx->ls_vgpr_fix = ls_vgpr_fix;
+            sctx->do_update_shaders = true;
+         }
+      }
     }
  
     if (sctx->chip_class <= GFX9 && sctx->gs_shader.cso) {
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c

index 8ea6905..cd21685 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -1884,6 +1884,7 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_sh
            * The LS VGPR fix prefers this too.
            */
           key->opt.prefer_mono = 1;
+         key->opt.same_patch_vertices = sctx->same_patch_vertices;
        }
  
        key->part.tcs.epilog.prim_mode =
author	Marek Olšák <marek.olsak@amd.com>
	Sat, 14 Nov 2020 06:12:29 +0000 (01:12 -0500)
committer	Marge Bot <eric+marge@anholt.net>
	Mon, 23 Nov 2020 02:22:21 +0000 (02:22 +0000)
src/gallium/drivers/radeonsi/si_pipe.h		patch \| blob \| history
src/gallium/drivers/radeonsi/si_shader.c		patch \| blob \| history
src/gallium/drivers/radeonsi/si_shader.h		patch \| blob \| history
src/gallium/drivers/radeonsi/si_shader_internal.h		patch \| blob \| history
src/gallium/drivers/radeonsi/si_shader_llvm.c		patch \| blob \| history
src/gallium/drivers/radeonsi/si_shader_llvm_ps.c		patch \| blob \| history
src/gallium/drivers/radeonsi/si_state_draw.c		patch \| blob \| history
src/gallium/drivers/radeonsi/si_state_shaders.c		patch \| blob \| history