Totals from 2504 (3.27% of 76572) affected shaders:
MaxWaves: 74098 -> 74106 (+0.01%)
Instrs: 1829278 -> 1823427 (-0.32%); split: -0.32%, +0.00%
CodeSize: 9775908 -> 9759308 (-0.17%); split: -0.18%, +0.01%
Latency: 13494107 -> 13485390 (-0.06%); split: -0.10%, +0.04%
InvThroughput: 2052428 -> 2048724 (-0.18%); split: -0.18%, +0.00%
VClause: 26637 -> 26640 (+0.01%); split: -0.04%, +0.05%
SClause: 62027 -> 61988 (-0.06%); split: -0.14%, +0.08%
Copies: 73776 -> 73815 (+0.05%); split: -0.07%, +0.12%
PreVGPRs: 84403 -> 84397 (-0.01%)
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25220>
(0x05, "v_interp_p2_rtz_f16_f32_inreg"),
}
for (code, name) in VINTERP:
- opcode(name, -1, -1, -1, code, Format.VINTERP_INREG, InstrClass.Valu32)
+ opcode(name, -1, -1, -1, code, Format.VINTERP_INREG, InstrClass.Valu32, False, True)
# VOP3 instructions: 3 inputs, 1 output
}
}
+void
+interp_p2_f32_inreg_to_fma_dpp(aco_ptr<Instruction>& instr) /* rewrites instr in place as a
+    * v_fma_f32 carrying DPP16 controls; the caller is expected to pass a
+    * v_interp_p2_f32_inreg instruction. */
+{
+ static_assert(sizeof(DPP16_instruction) == sizeof(VINTERP_inreg_instruction),
+ "Invalid instr cast."); /* instr is re-tagged below rather than reallocated, so the two
+    * instruction structs must have the same size for dpp16() access to be valid. */
+ instr->format = asVOP3(Format::DPP16);
+ instr->opcode = aco_opcode::v_fma_f32;
+ instr->dpp16().dpp_ctrl = dpp_quad_perm(2, 2, 2, 2); /* same selector (2) for all four slots */
+ instr->dpp16().row_mask = 0xf; /* all four mask bits set */
+ instr->dpp16().bank_mask = 0xf; /* all four mask bits set */
+ instr->dpp16().bound_ctrl = 0;
+ instr->dpp16().fetch_inactive = 1; /* NOTE(review): presumably so that inactive lanes can
+    * still be read as the quad_perm source -- confirm against the ISA docs. */
+}
+
/* apply omod / clamp modifiers if the def is used only once and the instruction can have modifiers */
bool
apply_omod_clamp(opt_ctx& ctx, aco_ptr<Instruction>& instr)
bool can_vop3 = can_use_VOP3(ctx, instr);
bool is_mad_mix =
instr->opcode == aco_opcode::v_fma_mix_f32 || instr->opcode == aco_opcode::v_fma_mixlo_f16;
- if (!instr->isSDWA() && !is_mad_mix && !can_vop3)
+ bool needs_vop3 = !instr->isSDWA() && !instr->isVINTERP_INREG() && !is_mad_mix;
+ if (needs_vop3 && !can_vop3)
return false;
/* SDWA omod is GFX9+. */
- bool can_use_omod = (can_vop3 || ctx.program->gfx_level >= GFX9) && !instr->isVOP3P();
+ bool can_use_omod =
+ (can_vop3 || ctx.program->gfx_level >= GFX9) && !instr->isVOP3P() &&
+ (!instr->isVINTERP_INREG() || instr->opcode == aco_opcode::v_interp_p2_f32_inreg);
ssa_info& def_info = ctx.info[instr->definitions[0].tempId()];
/* MADs/FMAs are created later, so we don't have to update the original add */
assert(!ctx.info[instr->definitions[0].tempId()].is_mad());
- if (!instr->isSDWA() && !instr->isVOP3P())
- instr->format = asVOP3(instr->format);
-
if (!def_info.is_clamp() && (instr->valu().clamp || instr->valu().omod))
return false;
+ if (needs_vop3)
+ instr->format = asVOP3(instr->format);
+
+ if (!def_info.is_clamp() && instr->opcode == aco_opcode::v_interp_p2_f32_inreg)
+ interp_p2_f32_inreg_to_fma_dpp(instr);
+
if (def_info.is_omod2())
instr->valu().omod = 1;
else if (def_info.is_omod4())
finish_opt_test();
END_TEST
+
+BEGIN_TEST(optimize.vinterp_inreg_output_modifiers) /* checks which output modifiers the
+    * optimizer folds into v_interp_p2_*_inreg instructions; the //! and //>> lines are the
+    * expected optimizer output for each case. */
+ //>> v1: %a, v1: %b, v1: %c = p_startpgm
+ if (!setup_cs("v1 v1 v1", GFX11))
+ return;
+
+ //! v1: %res0 = v_interp_p2_f32_inreg %a, %b, %c clamp
+ //! p_unit_test 0, %res0
+ Temp tmp = bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, bld.def(v1), inputs[0],
+ inputs[1], inputs[2]);
+ writeout(0, fsat(tmp)); /* case 0: f32 interp + saturate -> clamp stays on the interp opcode */
+
+ //! v1: %res1 = v_fma_f32 %b, %a, %c *2 quad_perm:[2,2,2,2] fi
+ //! p_unit_test 1, %res1
+ tmp = bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, bld.def(v1), inputs[1], inputs[0],
+ inputs[2]);
+ tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp); /* 0x40000000
+    * is 2.0f; case 1 expects the multiply to become a *2 modifier on a v_fma_f32 with DPP. */
+ writeout(1, tmp);
+
+ //! v2b: %res2 = v_interp_p2_f16_f32_inreg %a, %b, %c clamp
+ //! p_unit_test 2, %res2
+ tmp = bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, bld.def(v2b), inputs[0],
+ inputs[1], inputs[2]);
+ writeout(2, fsat(tmp)); /* case 2: f16 result + saturate -> clamp folded into the interp */
+
+ //! v2b: %tmp3 = v_interp_p2_f16_f32_inreg %b, %a, %c
+ //! v2b: %res3 = v_mul_f16 2.0, %tmp3
+ //! p_unit_test 3, %res3
+ tmp = bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, bld.def(v2b), inputs[1],
+ inputs[0], inputs[2]);
+ tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000u), tmp); /* 0x4000 is
+    * 2.0 in half precision; case 3 expects the multiply to remain a separate v_mul_f16. */
+ writeout(3, tmp);
+
+ finish_opt_test();
+END_TEST