From 87d13ee73d3fa6e80902cf18b74ba6008a6eb501 Mon Sep 17 00:00:00 2001 From: Georg Lehmann Date: Fri, 4 Aug 2023 20:55:58 +0200 Subject: [PATCH] aco: combine a | ~b to bfi(b, a, -1) MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Somehow I missed this when writing the a & ~b patch. Foz-DB Navi21: Totals from 1591 (1.20% of 132657) affected shaders: Instrs: 2316379 -> 2315940 (-0.02%) CodeSize: 12524240 -> 12528724 (+0.04%); split: -0.00%, +0.04% Latency: 45393195 -> 45389285 (-0.01%); split: -0.01%, +0.00% InvThroughput: 8658991 -> 8657944 (-0.01%); split: -0.01%, +0.00% Copies: 135777 -> 135778 (+0.00%) Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_optimizer.cpp | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 4c03cac..d74f795 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -3588,9 +3588,11 @@ combine_and_subbrev(opt_ctx& ctx, aco_ptr& instr) return false; } -/* v_and(a, not(b)) -> v_bfi_b32(b, 0, a) */ +/* v_and(a, not(b)) -> v_bfi_b32(b, 0, a) + * v_or(a, not(b)) -> v_bfi_b32(b, a, -1) + */ bool -combine_v_and_not(opt_ctx& ctx, aco_ptr& instr) +combine_v_andor_not(opt_ctx& ctx, aco_ptr& instr) { if (instr->usesModifiers()) return false; @@ -3606,15 +3608,20 @@ combine_v_and_not(opt_ctx& ctx, aco_ptr& instr) Operand::zero(), instr->operands[!i], }; + if (instr->opcode == aco_opcode::v_or_b32) { + ops[1] = instr->operands[!i]; + ops[2] = Operand::c32(-1); + } if (!check_vop3_operands(ctx, 3, ops)) continue; Instruction* new_instr = create_instruction(aco_opcode::v_bfi_b32, Format::VOP3, 3, 1); - new_instr->operands[0] = copy_operand(ctx, op_instr->operands[0]); - new_instr->operands[1] = Operand::zero(); - new_instr->operands[2] = instr->operands[!i]; + if (op_instr->operands[0].isTemp()) + ctx.uses[op_instr->operands[0].tempId()]++; + for (unsigned j = 0; j < 3; j++) + new_instr->operands[j] = ops[j]; new_instr->definitions[0] = instr->definitions[0]; new_instr->pass_flags = instr->pass_flags; instr.reset(new_instr); @@ -4419,6 +4426,7 @@ combine_instruction(opt_ctx& ctx, aco_ptr& instr) } else if (combine_three_valu_op(ctx, instr, aco_opcode::v_or_b32, aco_opcode::v_or3_b32, "012", 1 | 2)) { } else if (combine_add_or_then_and_lshl(ctx, instr)) { + } else if (combine_v_andor_not(ctx, instr)) { } } else if (instr->opcode == aco_opcode::v_xor_b32 && ctx.program->gfx_level >= GFX10) { if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xor3_b32, "012", @@ -4496,7 +4504,7 @@ combine_instruction(opt_ctx& ctx, aco_ptr& instr) combine_sabsdiff(ctx, instr); } else if (instr->opcode == aco_opcode::v_and_b32) { if (combine_and_subbrev(ctx, instr)) { - } else if (combine_v_and_not(ctx, instr)) { + } else if (combine_v_andor_not(ctx, instr)) { } } else if (instr->opcode == aco_opcode::v_fma_f32 || instr->opcode == aco_opcode::v_fma_f16) { /* set existing v_fma_f32 with label_mad so we can create v_fmamk_f32/v_fmaak_f32. -- 2.7.4