return true;
}
+/**
+ * Replaces a v_mad_u32_u16 whose addend is the constant 0 with a VOP2
+ * v_mul_u32_u24, provided the rewrite is safe and encodable. If any
+ * precondition fails, the instruction is left untouched.
+ *
+ * \param ctx    optimizer context (not used by this function)
+ * \param instr  instruction to rewrite in place; callers are expected to pass
+ *               a v_mad_u32_u16
+ */
+void select_mul_u32_u24(opt_ctx &ctx, aco_ptr<Instruction>& instr)
+{
+   if (instr->usesModifiers())
+      return;
+
+   /* Only valid if the accumulator is zero (this is selected by isel to
+    * combine more v_add_u32+v_mad_u32_u16 together), but the optimizer
+    * falls back here when that is not possible.
+    */
+   if (!instr->operands[2].constantEquals(0))
+      return;
+
+   /* Only valid if the upper 16-bits of both operands are zero (because
+    * v_mul_u32_u24 doesn't mask them). Temporaries must carry the 16-bit
+    * flag; constants are checked by value, since a constant with bits set
+    * above bit 15 would change the product.
+    */
+   for (unsigned i = 0; i < 2; i++) {
+      const Operand& factor = instr->operands[i];
+      if (factor.isTemp() && !factor.is16bit())
+         return;
+      if (factor.isConstant() && factor.constantValue() > 0xffffu)
+         return;
+   }
+
+   const auto is_const_or_sgpr = [](const Operand& op) {
+      return op.isConstant() ||
+             (op.hasRegClass() && op.regClass().type() == RegType::sgpr);
+   };
+
+   /* VOP2 instructions can only take constants/sgprs in operand 0, so a
+    * constant/sgpr second factor has to be moved to the front.
+    */
+   const bool swap = is_const_or_sgpr(instr->operands[1]);
+
+   /* VOP2 can't take both constants/sgprs, keep v_mad_u32_u16 because
+    * v_mul_u32_u24 has no advantages.
+    */
+   if (swap && is_const_or_sgpr(instr->operands[0]))
+      return;
+
+   VOP2_instruction *new_instr = create_instruction<VOP2_instruction>(aco_opcode::v_mul_u32_u24, Format::VOP2, 2, 1);
+   new_instr->operands[0] = instr->operands[swap ? 1 : 0];
+   new_instr->operands[1] = instr->operands[swap ? 0 : 1];
+   new_instr->definitions[0] = instr->definitions[0];
+   /* Hand ownership of the replacement to the caller's pointer. */
+   instr.reset(new_instr);
+}
+
void select_instruction(opt_ctx &ctx, aco_ptr<Instruction>& instr)
{
const uint32_t threshold = 4;
return;
}
+ if (instr->opcode == aco_opcode::v_mad_u32_u16)
+ select_mul_u32_u24(ctx, instr);
+
if (instr->isSDWA() || instr->isDPP() || (instr->isVOP3() && ctx.program->chip_class < GFX10))
return; /* some encodings can't ever take literals */
finish_opt_test();
}
END_TEST
+
+/* Test helper: builds a VOP3 v_mad_u32_u16 with operands (a, b, c) writing to
+ * a definition obtained from bld.def(v1), and returns the builder result as a
+ * Temp. The is16bit flag is applied to both factors a and b through
+ * set16bit(); the third operand c is forwarded unchanged.
+ */
+Temp create_mad_u32_u16(Operand a, Operand b, Operand c, bool is16bit = true)
+{
+   /* a and b are by-value copies, so flagging them does not affect the caller. */
+   b.set16bit(is16bit);
+   a.set16bit(is16bit);
+
+   auto dst = bld.def(v1);
+   return bld.vop3(aco_opcode::v_mad_u32_u16, dst, a, b, c);
+}
+
+BEGIN_TEST(optimize.mad_u32_u16)
+ for (unsigned i = GFX9; i <= GFX10; i++) {
+ //>> v1: %a, v1: %b, s1: %c, s2: %_:exec = p_startpgm
+ if (!setup_cs("v1 v1 s1", (chip_class)i))
+ continue;
+ /* Case 0: both factors are VGPR inputs flagged 16-bit and the addend is 0; the expected pattern is v_mul_u32_u24. */
+ //! v1: %res0 = v_mul_u32_u24 (is16bit)%a, (is16bit)%b
+ //! p_unit_test 0, %res0
+ writeout(0, create_mad_u32_u16(Operand(inputs[0]), Operand(inputs[1]), Operand(0u)));
+ /* Case 1: constant 42 as the first factor; the expected v_mul_u32_u24 has 42 in operand 0. */
+ //! v1: %res1 = v_mul_u32_u24 42, (is16bit)%a
+ //! p_unit_test 1, %res1
+ writeout(1, create_mad_u32_u16(Operand(42u), Operand(inputs[0]), Operand(0u)));
+ /* Case 2: constant 42 as the second factor; the expected pattern shows it moved to operand 0. */
+ //! v1: %res2 = v_mul_u32_u24 42, (is16bit)%a
+ //! p_unit_test 2, %res2
+ writeout(2, create_mad_u32_u16(Operand(inputs[0]), Operand(42u), Operand(0u)));
+ /* Case 3: SGPR input %c as the first factor and VGPR %a as the second; the expected pattern is v_mul_u32_u24. */
+ //! v1: %res3 = v_mul_u32_u24 (is16bit)%c, (is16bit)%a
+ //! p_unit_test 3, %res3
+ writeout(3, create_mad_u32_u16(Operand(inputs[2]), Operand(inputs[0]), Operand(0u)));
+ /* Case 4: a constant and an SGPR as the two factors; the expected pattern keeps v_mad_u32_u16. */
+ //! v1: %res4 = v_mad_u32_u16 42, (is16bit)%c, 0
+ //! p_unit_test 4, %res4
+ writeout(4, create_mad_u32_u16(Operand(42u), Operand(inputs[2]), Operand(0u)));
+ /* Case 5: is16bit=false, so %a is not flagged 16-bit; the expected pattern keeps v_mad_u32_u16. */
+ //! v1: %res5 = v_mad_u32_u16 42, %a, 0
+ //! p_unit_test 5, %res5
+ writeout(5, create_mad_u32_u16(Operand(42u), Operand(inputs[0]), Operand(0u), false));
+
+ finish_opt_test();
+ }
+END_TEST