uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0);
uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1);
- if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) {
+ if (src0_ub <= 0xffff && src1_ub <= 0xffff &&
+ ctx->options->chip_class >= GFX9) {
+ /* Initialize the accumulator to 0 to allow further combinations
+ * in the optimizer.
+ */
+ Operand op0(get_alu_src(ctx, instr->src[0]));
+ Operand op1(get_alu_src(ctx, instr->src[1]));
+ bld.vop3(aco_opcode::v_mad_u32_u16, Definition(dst), bld.set16bit(op0), bld.set16bit(op1), Operand(0u));
+ } else if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) {
emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_u32_u24, dst, true);
} else {
emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u32, dst);