From 20e48551acff92f07109630e134f59c81107e6bc Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Mon, 2 Nov 2020 15:15:04 +0100 Subject: [PATCH] aco: select v_mul_lo_u16 for 16-bit multiplications that can't overflow MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Only on GFX8-9 because GFX10 doesn't zero the upper 16 bits. No fossils-db changes. Signed-off-by: Samuel Pitoiset Reviewed-by: Timur Kristóf Reviewed-by: Rhys Perry Part-of: --- src/amd/compiler/aco_instruction_selection.cpp | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 3bae0a8..ddf94bf 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -732,7 +732,8 @@ void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o } void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, - bool commutative, bool swap_srcs=false, bool flush_denorms = false) + bool commutative, bool swap_srcs=false, + bool flush_denorms = false, bool nuw = false) { Builder bld(ctx->program, ctx->block); bld.is_precise = instr->exact; @@ -754,7 +755,11 @@ void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o Temp tmp = bld.vop2(op, bld.def(v1), src0, src1); bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), tmp); } else { - bld.vop2(op, Definition(dst), src0, src1); + if (nuw) { + bld.nuw().vop2(op, Definition(dst), src0, src1); + } else { + bld.vop2(op, Definition(dst), src0, src1); + } } } @@ -1721,16 +1726,29 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX8) { emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst, true); } else if (dst.type() == RegType::vgpr) { + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0); uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1); if (src0_ub <= 0xffff && src1_ub <= 0xffff && + src0_ub * src1_ub <= 0xffff && + (ctx->options->chip_class == GFX8 || + ctx->options->chip_class == GFX9)) { + /* If the 16-bit multiplication can't overflow, emit v_mul_lo_u16 + * but only on GFX8-9 because GFX10 doesn't zero the upper 16 + * bits. + */ + emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst, + true /* commutative */, false, false, + true /* nuw */); + } else if (src0_ub <= 0xffff && src1_ub <= 0xffff && ctx->options->chip_class >= GFX9) { /* Initialize the accumulator to 0 to allow further combinations * in the optimizer. */ - Operand op0(get_alu_src(ctx, instr->src[0])); - Operand op1(get_alu_src(ctx, instr->src[1])); + Operand op0(src0); + Operand op1(src1); bld.vop3(aco_opcode::v_mad_u32_u16, Definition(dst), bld.set16bit(op0), bld.set16bit(op1), Operand(0u)); } else if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) { emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_u32_u24, dst, true); -- 2.7.4