From 2ae94b38943f36cd02bddcf40d87063c676dcb4d Mon Sep 17 00:00:00 2001
From: Georg Lehmann
Date: Tue, 8 Aug 2023 13:35:18 +0200
Subject: [PATCH] aco: implement some exclusive scans with inclusive scans
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Exclusive scan lowering uses a full wave shift; for iadd/ixor it's faster
to do an inclusive scan and subtract/xor the thread's own source.

Foz-DB Navi21:
Totals from 21 (0.02% of 132657) affected shaders:
Instrs: 10925 -> 10727 (-1.81%)
CodeSize: 58064 -> 56488 (-2.71%)
Latency: 178471 -> 177928 (-0.30%)
InvThroughput: 24374 -> 24145 (-0.94%)

Reviewed-by: Timur Kristóf
Part-of:
---
 src/amd/compiler/aco_instruction_selection.cpp | 47 ++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index e72882e..baad7de 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -7916,6 +7916,42 @@ emit_reduction_instr(isel_context* ctx, aco_opcode aco_op, ReduceOp op, unsigned
    return dst.getTemp();
 }
 
+Temp
+inclusive_scan_to_exclusive(isel_context* ctx, ReduceOp op, Temp scan, Temp src)
+{
+   Builder bld(ctx->program, ctx->block);
+
+   switch (op) {
+   case iadd8:
+   case iadd16:
+   case iadd32: return bld.vsub32(bld.def(scan.regClass()), scan, src);
+   case ixor64:
+   case iadd64: {
+      Temp src00 = bld.tmp(v1);
+      Temp src01 = bld.tmp(v1);
+      bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), scan);
+      Temp src10 = bld.tmp(v1);
+      Temp src11 = bld.tmp(v1);
+      bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src);
+
+      Temp lower = bld.tmp(v1);
+      Temp upper = bld.tmp(v1);
+      if (op == iadd64) {
+         Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
+         bld.vsub32(Definition(upper), src01, src11, false, borrow);
+      } else {
+         bld.vop2(aco_opcode::v_xor_b32, Definition(lower), src00, src10);
+         bld.vop2(aco_opcode::v_xor_b32, Definition(upper), src01, src11);
+      }
+      return bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), lower, upper);
+   }
+   case ixor8:
+   case ixor16:
+   case ixor32: return bld.vop2(aco_opcode::v_xor_b32, bld.def(scan.regClass()), scan, src);
+   default: unreachable("Unsupported op");
+   }
+}
+
 void
 emit_interp_center(isel_context* ctx, Temp dst, Temp bary, Temp pos1, Temp pos2)
 {
@@ -8453,8 +8489,19 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
       default: unreachable("unknown reduce intrinsic");
       }
 
+      /* Avoid whole wave shift. */
+      const bool use_inclusive_for_exclusive = aco_op == aco_opcode::p_exclusive_scan &&
+                                               (op == nir_op_iadd || op == nir_op_ixor) &&
+                                               dst.type() == RegType::vgpr;
+      if (use_inclusive_for_exclusive)
+         aco_op = aco_opcode::p_inclusive_scan;
+
       Temp tmp_dst =
         emit_reduction_instr(ctx, aco_op, reduce_op, cluster_size, bld.def(dst.regClass()), src);
+
+      if (use_inclusive_for_exclusive)
+         tmp_dst = inclusive_scan_to_exclusive(ctx, reduce_op, tmp_dst, src);
+
       emit_wqm(bld, tmp_dst, dst, create_helpers);
    }
    break;
-- 
2.7.4
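
For reference, the transformation rests on the fact that an inclusive scan at a given lane already contains that lane's own source, so for invertible ops the exclusive result can be recovered per lane: exclusive_add[i] = inclusive_add[i] - src[i] and exclusive_xor[i] = inclusive_xor[i] ^ src[i]. Below is a minimal standalone C++ sketch that checks this identity on plain arrays; it is illustrative only (ordinary host code, not the ACO Builder API, with made-up variable names).

// Sketch: recover exclusive iadd/ixor scans from inclusive scans,
// mirroring per lane what inclusive_scan_to_exclusive() does in the patch.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

int main()
{
   std::vector<uint32_t> src = {3, 7, 1, 9, 4, 2, 8, 5}; // one value per "lane"

   // Inclusive scans: element i includes src[i].
   std::vector<uint32_t> inc_add(src.size()), inc_xor(src.size());
   uint32_t add = 0, x = 0;
   for (size_t i = 0; i < src.size(); i++) {
      add += src[i];
      x ^= src[i];
      inc_add[i] = add;
      inc_xor[i] = x;
   }

   // Exclusive scans recovered per lane by undoing the lane's own source,
   // instead of shifting values across the whole wave.
   uint32_t exc_add = 0, exc_xor = 0;
   for (size_t i = 0; i < src.size(); i++) {
      assert(inc_add[i] - src[i] == exc_add);   // iadd: subtract own source
      assert((inc_xor[i] ^ src[i]) == exc_xor); // ixor: xor own source
      exc_add += src[i];
      exc_xor ^= src[i];
   }
   return 0;
}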