aco: add ps prolog generation for radeonsi
author Qiang Yu <yuq825@gmail.com>
Wed, 9 Aug 2023 07:07:39 +0000 (15:07 +0800)
committer Marge Bot <emma+marge@anholt.net>
Tue, 10 Oct 2023 02:36:33 +0000 (02:36 +0000)
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Signed-off-by: Qiang Yu <yuq825@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24973>

src/amd/compiler/aco_instruction_selection.cpp
src/amd/compiler/aco_interface.cpp
src/amd/compiler/aco_interface.h
src/amd/compiler/aco_ir.h
src/amd/compiler/aco_shader_info.h

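diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 313310f..e8d1985 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp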
@@ -1105,6 +1105,22 @@ emit_boolean_logic(isel_context* ctx, nir_alu_instr* instr, Builder::WaveSpecifi
 }
 
 void
+select_vec2(isel_context* ctx, Temp dst, Temp cond, Temp then, Temp els)
+{
+   Builder bld(ctx->program, ctx->block);
+
+   Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
+   bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
+   Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
+   bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);
+
+   Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
+   Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);
+
+   bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
+}
+
+void
 emit_bcsel(isel_context* ctx, nir_alu_instr* instr, Temp dst)
 {
    Builder bld(ctx->program, ctx->block);
@@ -1122,15 +1138,7 @@ emit_bcsel(isel_context* ctx, nir_alu_instr* instr, Temp dst)
 
          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
       } else if (dst.size() == 2) {
-         Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
-         Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);
-
-         Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
-         Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);
-
-         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
+         select_vec2(ctx, dst, cond, then, els);
       } else {
          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
       }
@@ -11829,6 +11837,208 @@ get_gl_vs_prolog_vertex_index(isel_context* ctx, const struct aco_gl_vs_prolog_i
    return index;
 }
 
+void
+emit_polygon_stipple(isel_context* ctx, const struct aco_ps_prolog_info* finfo)
+{
+   Builder bld(ctx->program, ctx->block);
+
+   /* Use the fixed-point gl_FragCoord input.
+    * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
+    * per coordinate to get the repeating effect.
+    */
+   Temp pos_fixed_pt = get_arg(ctx, ctx->args->pos_fixed_pt);
+   Temp addr0 = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x1f), pos_fixed_pt);
+   Temp addr1 = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), pos_fixed_pt, Operand::c32(16u),
+                         Operand::c32(5u));
+
+   /* Load the buffer descriptor. */
+   Temp list = get_arg(ctx, finfo->internal_bindings);
+   list = convert_pointer_to_64_bit(ctx, list);
+   Temp desc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), list,
+                        Operand::c32(finfo->poly_stipple_buf_offset));
+
+   /* The stipple pattern is 32x32, each row has 32 bits. */
+   Temp offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2), addr1);
+   Temp row = bld.mubuf(aco_opcode::buffer_load_dword, bld.def(v1), desc, offset, Operand::c32(0u),
+                        0, true);
+   Temp bit = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), row, addr0, Operand::c32(1u));
+   Temp cond = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand::zero(), bit);
+   bld.pseudo(aco_opcode::p_demote_to_helper, cond);
+
+   ctx->block->kind |= block_kind_uses_discard;
+   ctx->program->needs_exact = true;
+}
+
+void
+overwrite_interp_args(isel_context* ctx, const struct aco_ps_prolog_info* finfo)
+{
+   Builder bld(ctx->program, ctx->block);
+
+   if (finfo->bc_optimize_for_persp || finfo->bc_optimize_for_linear) {
+      /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
+       * The hw doesn't compute CENTROID if the whole wave only
+       * contains fully-covered quads.
+       */
+      Temp bc_optimize = get_arg(ctx, ctx->args->prim_mask);
+
+      /* enabled when bit 31 is set */
+      Temp cond =
+         bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), bc_optimize, Operand::c32(31u));
+
+      /* scale 1bit scc to wave size bits used by v_cndmask */
+      cond = bool_to_vector_condition(ctx, cond);
+
+      if (finfo->bc_optimize_for_persp) {
+         Temp center = get_arg(ctx, ctx->args->persp_center);
+         Temp centroid = get_arg(ctx, ctx->args->persp_centroid);
+
+         Temp dst = bld.tmp(v2);
+         select_vec2(ctx, dst, cond, center, centroid);
+         ctx->arg_temps[ctx->args->persp_centroid.arg_index] = dst;
+      }
+
+      if (finfo->bc_optimize_for_linear) {
+         Temp center = get_arg(ctx, ctx->args->linear_center);
+         Temp centroid = get_arg(ctx, ctx->args->linear_centroid);
+
+         Temp dst = bld.tmp(v2);
+         select_vec2(ctx, dst, cond, center, centroid);
+         ctx->arg_temps[ctx->args->linear_centroid.arg_index] = dst;
+      }
+   }
+
+   if (finfo->force_persp_sample_interp) {
+      Temp persp_sample = get_arg(ctx, ctx->args->persp_sample);
+      ctx->arg_temps[ctx->args->persp_center.arg_index] = persp_sample;
+      ctx->arg_temps[ctx->args->persp_centroid.arg_index] = persp_sample;
+   }
+
+   if (finfo->force_linear_sample_interp) {
+      Temp linear_sample = get_arg(ctx, ctx->args->linear_sample);
+      ctx->arg_temps[ctx->args->linear_center.arg_index] = linear_sample;
+      ctx->arg_temps[ctx->args->linear_centroid.arg_index] = linear_sample;
+   }
+
+   if (finfo->force_persp_center_interp) {
+      Temp persp_center = get_arg(ctx, ctx->args->persp_center);
+      ctx->arg_temps[ctx->args->persp_sample.arg_index] = persp_center;
+      ctx->arg_temps[ctx->args->persp_centroid.arg_index] = persp_center;
+   }
+
+   if (finfo->force_linear_center_interp) {
+      Temp linear_center = get_arg(ctx, ctx->args->linear_center);
+      ctx->arg_temps[ctx->args->linear_sample.arg_index] = linear_center;
+      ctx->arg_temps[ctx->args->linear_centroid.arg_index] = linear_center;
+   }
+}
+
+void
+overwrite_samplemask_arg(isel_context* ctx, const struct aco_ps_prolog_info* finfo)
+{
+   Builder bld(ctx->program, ctx->block);
+
+   /* Section 15.2.2 (Shader Inputs) of the OpenGL 4.5 (Core Profile) spec
+    * says:
+    *
+    *    "When per-sample shading is active due to the use of a fragment
+    *     input qualified by sample or due to the use of the gl_SampleID
+    *     or gl_SamplePosition variables, only the bit for the current
+    *     sample is set in gl_SampleMaskIn. When state specifies multiple
+    *     fragment shader invocations for a given fragment, the sample
+    *     mask for any single fragment shader invocation may specify a
+    *     subset of the covered samples for the fragment. In this case,
+    *     the bit corresponding to each covered sample will be set in
+    *     exactly one fragment shader invocation."
+    *
+    * The samplemask loaded by hardware is always the coverage of the
+    * entire pixel/fragment, so mask bits out based on the sample ID.
+    */
+   if (finfo->samplemask_log_ps_iter) {
+      Temp ancillary = get_arg(ctx, ctx->args->ancillary);
+      Temp sampleid = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), ancillary, Operand::c32(8u),
+                               Operand::c32(4u));
+      Temp samplemask = get_arg(ctx, ctx->args->sample_coverage);
+
+      uint32_t ps_iter_mask = ac_get_ps_iter_mask(1 << finfo->samplemask_log_ps_iter);
+      Temp iter_mask = bld.copy(bld.def(v1), Operand::c32(ps_iter_mask));
+
+      Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sampleid, iter_mask);
+      samplemask = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), samplemask, mask);
+
+      ctx->arg_temps[ctx->args->sample_coverage.arg_index] = samplemask;
+   }
+}
+
+Temp
+get_interp_color(isel_context* ctx, int interp_vgpr, unsigned attr_index, unsigned comp)
+{
+   Builder bld(ctx->program, ctx->block);
+
+   Temp dst = bld.tmp(v1);
+
+   Temp prim_mask = get_arg(ctx, ctx->args->prim_mask);
+
+   if (interp_vgpr != -1) {
+      /* interp args are all 2 vgprs */
+      int arg_index = ctx->args->persp_sample.arg_index + interp_vgpr / 2;
+      Temp interp_ij = ctx->arg_temps[arg_index];
+
+      emit_interp_instr(ctx, attr_index, comp, interp_ij, dst, prim_mask);
+   } else {
+      emit_interp_mov_instr(ctx, attr_index, comp, 0, dst, prim_mask);
+   }
+
+   return dst;
+}
+
+void
+interpolate_color_args(isel_context* ctx, const struct aco_ps_prolog_info* finfo,
+                       std::vector<Operand>& regs)
+{
+   if (!finfo->colors_read)
+      return;
+
+   Builder bld(ctx->program, ctx->block);
+
+   unsigned vgpr = 256 + ctx->args->num_vgprs_used;
+
+   if (finfo->color_two_side) {
+      Temp face = get_arg(ctx, ctx->args->front_face);
+      Temp is_face_positive =
+         bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), face);
+
+      u_foreach_bit (i, finfo->colors_read) {
+         unsigned color_index = i / 4;
+         unsigned front_index = finfo->color_attr_index[color_index];
+         int interp_vgpr = finfo->color_interp_vgpr_index[color_index];
+
+         /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
+          * otherwise it's at offset "num_inputs".
+          */
+         unsigned back_index = finfo->num_interp_inputs;
+         if (color_index == 1 && finfo->colors_read & 0xf)
+            back_index++;
+
+         Temp front = get_interp_color(ctx, interp_vgpr, front_index, i % 4);
+         Temp back = get_interp_color(ctx, interp_vgpr, back_index, i % 4);
+
+         Temp color =
+            bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), back, front, is_face_positive);
+
+         regs.emplace_back(Operand(color, PhysReg{vgpr++}));
+      }
+   } else {
+      u_foreach_bit (i, finfo->colors_read) {
+         unsigned color_index = i / 4;
+         unsigned attr_index = finfo->color_attr_index[color_index];
+         int interp_vgpr = finfo->color_interp_vgpr_index[color_index];
+         Temp color = get_interp_color(ctx, interp_vgpr, attr_index, i % 4);
+
+         regs.emplace_back(Operand(color, PhysReg{vgpr++}));
+      }
+   }
+}
+
 } /* end namespace */
 
 void
@@ -12750,4 +12960,46 @@ select_gl_vs_prolog(Program* program, void* pinfo, ac_shader_config* config,
    finish_program(&ctx);
 }
 
+void
+select_ps_prolog(Program* program, void* pinfo, ac_shader_config* config,
+                 const struct aco_compiler_options* options, const struct aco_shader_info* info,
+                 const struct ac_shader_args* args)
+{
+   const struct aco_ps_prolog_info* finfo = (const struct aco_ps_prolog_info*)pinfo;
+   isel_context ctx =
+      setup_isel_context(program, 0, NULL, config, options, info, args, SWStage::FS);
+
+   ctx.block->fp_mode = program->next_fp_mode;
+
+   add_startpgm(&ctx);
+   append_logical_start(ctx.block);
+
+   if (finfo->poly_stipple)
+      emit_polygon_stipple(&ctx, finfo);
+
+   overwrite_interp_args(&ctx, finfo);
+
+   overwrite_samplemask_arg(&ctx, finfo);
+
+   std::vector<Operand> regs;
+   passthrough_all_args(&ctx, regs);
+
+   interpolate_color_args(&ctx, finfo, regs);
+
+   program->config->float_mode = program->blocks[0].fp_mode.val;
+
+   append_logical_end(ctx.block);
+
+   build_end_with_regs(&ctx, regs);
+
+   /* To compute all end args in WQM mode if required by main part. */
+   if (finfo->needs_wqm)
+      set_wqm(&ctx, true);
+
+   /* Exit WQM mode finally. */
+   program->needs_exact = true;
+
+   finish_program(&ctx);
+}
+
 } // namespace aco
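diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp
index 4eefd18..0ca22ad 100644
--- a/src/amd/compiler/aco_interface.cpp
+++ b/src/amd/compiler/aco_interface.cpp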
@@ -415,3 +415,13 @@ aco_compile_gl_vs_prolog(const struct aco_compiler_options* options,
    aco_compile_shader_part(options, info, args, aco::select_gl_vs_prolog, (void*)pinfo,
                            build_prolog, binary, true);
 }
+
+void
+aco_compile_ps_prolog(const struct aco_compiler_options* options,
+                      const struct aco_shader_info* info, const struct aco_ps_prolog_info* pinfo,
+                      const struct ac_shader_args* args, aco_shader_part_callback* build_prolog,
+                      void** binary)
+{
+   aco_compile_shader_part(options, info, args, aco::select_ps_prolog, (void*)pinfo, build_prolog,
+                           binary, true);
+}
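diff --git a/src/amd/compiler/aco_interface.h b/src/amd/compiler/aco_interface.h
index e08b6b6..8f35e18 100644
--- a/src/amd/compiler/aco_interface.h
+++ b/src/amd/compiler/aco_interface.h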
@@ -89,6 +89,12 @@ void aco_compile_gl_vs_prolog(const struct aco_compiler_options* options,
                               const struct ac_shader_args* args,
                               aco_shader_part_callback* build_prolog, void** binary);
 
+void aco_compile_ps_prolog(const struct aco_compiler_options* options,
+                           const struct aco_shader_info* info,
+                           const struct aco_ps_prolog_info* pinfo,
+                           const struct ac_shader_args* args,
+                           aco_shader_part_callback* build_prolog, void** binary);
+
 uint64_t aco_get_codegen_flags();
 
 #ifdef __cplusplus
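diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 71effd8..ef35555 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h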
@@ -2258,6 +2258,10 @@ void select_gl_vs_prolog(Program* program, void* pinfo, ac_shader_config* config
                          const struct aco_compiler_options* options,
                          const struct aco_shader_info* info, const struct ac_shader_args* args);
 
+void select_ps_prolog(Program* program, void* pinfo, ac_shader_config* config,
+                      const struct aco_compiler_options* options,
+                      const struct aco_shader_info* info, const struct ac_shader_args* args);
+
 void lower_phis(Program* program);
 void calc_min_waves(Program* program);
 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
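diff --git a/src/amd/compiler/aco_shader_info.h b/src/amd/compiler/aco_shader_info.h
index c739100..1b49102 100644
--- a/src/amd/compiler/aco_shader_info.h
+++ b/src/amd/compiler/aco_shader_info.h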
@@ -104,6 +104,28 @@ struct aco_gl_vs_prolog_info {
    struct ac_arg internal_bindings;
 };
 
+struct aco_ps_prolog_info {
+   bool poly_stipple;
+   unsigned poly_stipple_buf_offset;
+
+   bool bc_optimize_for_persp;
+   bool bc_optimize_for_linear;
+   bool force_persp_sample_interp;
+   bool force_linear_sample_interp;
+   bool force_persp_center_interp;
+   bool force_linear_center_interp;
+
+   unsigned samplemask_log_ps_iter;
+   unsigned num_interp_inputs;
+   unsigned colors_read;
+   int color_interp_vgpr_index[2];
+   int color_attr_index[2];
+   bool color_two_side;
+   bool needs_wqm;
+
+   struct ac_arg internal_bindings;
+};
+
 struct aco_shader_info {
    enum ac_hw_stage hw_stage;
    uint8_t wave_size;